From 9afd31e66ddbe6ecb2206c4d2edf72b7a653702c Mon Sep 17 00:00:00 2001
From: Damian Stachura <damian.stachura@evidenceprime.com>
Date: Wed, 11 Feb 2026 18:25:16 +0100
Subject: [PATCH 1/2] Migrated HELM leaderboards into schema v0.2

---
 ...bd982107-7c03-4ee8-8a38-782d68883818.json} |  90 +-
 ...25aa6e41-ab16-4f63-9613-bfb83b9151c5.json} |  90 +-
 ...ddd52881-1248-4652-9f1d-5d2b58ede889.json} |  90 +-
 ...365bc693-73b6-41fe-a8fa-eba7b91febe0.json} |  90 +-
 ...a126b881-918a-411a-90e9-32d7b63d1e00.json} |  84 +-
 ...b8e54bb1-0768-4558-8dc2-4897d4e571aa.json} |  84 +-
 ...a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json} |  84 +-
 ...2413b504-7125-461b-ae9d-0c58211a5358.json} |  84 +-
 ...f350d9d1-b743-4017-bc68-a4dc726515d0.json} |  86 +-
 ...c32a1f0a-bf8a-42be-b155-4f87465235bc.json} |  86 +-
 ...96cfde1b-77de-4d2a-8b45-938116795108.json} |  86 +-
 .../56c180e5-45aa-4106-8f92-c6566c3c7dfc.json | 345 +++++++
 ...d633fcd6-eb01-49ff-ba7c-6ca12734746f.json} |  86 +-
 ...7a7b49ff-5060-4d12-acb9-607125fbe081.json} |  86 +-
 ...287a3646-d969-4bd9-9773-86463c1ba87f.json} |  86 +-
 ...97f3892f-9588-49ef-abef-3a0c965bb352.json} |  86 +-
 .../22ba68b0-6eec-47f2-b465-47f298e8da09.json | 345 +++++++
 ...9e5684dc-6380-4353-b966-7205d66340fa.json} |  84 +-
 ...1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json} |  84 +-
 ...20512a3b-ac0f-483a-8bec-9962980c579c.json} |  86 +-
 ...704c5c74-a0ee-457d-9b4e-3ae895ffc105.json} |  86 +-
 ...eb9224b8-0edb-4605-a2ee-cfb63f41370e.json} |  84 +-
 ...4cb58f80-c2b1-45c6-b781-19af47660eb0.json} |  86 +-
 ...6307e0c4-c983-4257-82d8-b2a50171eb8a.json} |  84 +-
 ...275cd615-bddf-4afe-a499-b463fe183486.json} |  86 +-
 ...03b48360-a387-44ba-94b2-2eb7c234a9fa.json} |  86 +-
 .../3a242fb8-07f9-460e-93eb-345aab0f994f.json | 345 +++++++
 ...5e5720d0-67fe-40a9-b65b-d4154848d83c.json} |  84 +-
 .../9c9239df-0cbb-411f-af40-1b2782f91255.json | 345 +++++++
 .../e1d12d96-185f-493e-bb08-8237623fb736.json | 345 +++++++
 ...aba1fded-b031-48df-87ef-dc744df33501.json} |  90 +-
 ...98f69aa6-b227-4076-a76e-1293cbe1c6cb.json} |  86 +-
 ...d2bb087e-a275-4fce-b6dc-001fd4545883.json} |  86 +-
 ...84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json} |  86 +-
 ...23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json} |  86 +-
 ...9cab3a77-4f32-48d0-ba11-e2323ccc4861.json} |  86 +-
 ...9e037c92-1253-49be-b31a-3aa017531d77.json} |  86 +-
 ...bd26c7cb-ce76-4b17-b617-d1d93a168c93.json} |  86 +-
 ...9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json} |  86 +-
 ...d69a1cbe-353c-4be9-b93b-5224d24c7adf.json} |  86 +-
 ...915cb39d-f21f-4ef1-a95f-f44f79ede893.json} |  86 +-
 ...fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json} |  84 +-
 ...eb51f418-6abf-4b2c-9f57-0b830c00bd15.json} |  86 +-
 ...41cd14b0-46ba-49da-844a-19fe866bef1e.json} |  86 +-
 ...7de93642-a4bc-430b-8733-9befeb6a0e23.json} |  86 +-
 ...4f18292a-1fef-4feb-9b17-045c96e3e137.json} |  86 +-
 ...7458c032-b24d-4f13-a659-b6e19d19a8e1.json} |  86 +-
 ...21eb1648-aad0-4297-9edc-c445e4c38694.json} |  86 +-
 ...99d657ae-e850-4caf-a599-13f1b8072273.json} |  86 +-
 ...10cd766e-442c-4b3d-833b-740417d9a6d9.json} |  86 +-
 .../bc6124a7-89df-4c3e-b824-56c948d1eeb5.json | 345 +++++++
 ...06719cd4-5654-49b6-9dee-e112d1601d1c.json} |  84 +-
 ...ed849999-48c2-4569-8bcd-dc73084e3197.json} |  84 +-
 ...01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json} |  86 +-
 ...32382d69-21c7-43a9-bb95-27607ec18cc9.json} |  86 +-
 ...77e702f7-37ef-4487-b047-74b13ef6d966.json} |  86 +-
 ...4ee3c647-740c-41a6-ac66-4a38b09317ff.json} |  86 +-
 ...ca30726a-00a6-4228-94fe-5dce00de1d5e.json} |  84 +-
 ...7862890a-298b-4bda-b8f1-7be6a5779365.json} |  84 +-
 .../8c73a09f-ba0d-4c12-a12a-776a17292151.json | 345 +++++++
 ...442aed0d-95c3-4436-ad63-b7b1e93307f4.json} |  84 +-
 ...7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json} |  84 +-
 ...bc2c91e0-6afd-4e44-b665-d5c7558f8981.json} |  84 +-
 ...a74b74f7-ccce-4341-a122-26728cc6bece.json} |  84 +-
 ...87811b75-afe8-413b-949d-7fd1f582a2e8.json} |  84 +-
 ...ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json} |  84 +-
 ...924080a0-c530-4e6d-b1a4-107de3bd7183.json} |  86 +-
 ...be23c720-a99a-4945-bc0b-ddc27c8eec39.json} |  84 +-
 ...425d4a41-2def-4581-9b61-ee33ecb3a822.json} | 179 +++-
 ...c12a8494-bafc-4097-874a-7c00636e96f8.json} | 175 +++-
 ...4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json} | 175 +++-
 ...19f61327-fcc3-408f-9254-2d6a2aadcd4e.json} | 175 +++-
 ...ccc17d56-bd26-409c-ac3f-262eaba9ce21.json} | 175 +++-
 ...f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json} | 175 +++-
 ...9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json} | 175 +++-
 ...f25c142c-8730-4241-a649-01d076e1f28d.json} | 175 +++-
 ...ab34f23e-36db-40c0-9681-f30b00692f98.json} | 175 +++-
 ...67281534-a03d-49d8-a586-25cb1a03134e.json} | 175 +++-
 ...3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json} | 175 +++-
 ...04ce2ba4-c382-4658-ba06-1def9499a243.json} | 175 +++-
 ...3a546396-d031-4958-8410-00e0d3406089.json} | 175 +++-
 ...e7b99aa6-08e8-4224-a805-16586eb44325.json} | 175 +++-
 ...43a3fe19-929a-463d-a0ed-791dad765188.json} | 175 +++-
 ...75468958-b75b-41fe-9813-070b793e86d9.json} | 175 +++-
 ...6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json} | 175 +++-
 ...3c9c425a-ce4a-4958-9744-7f9490ed5729.json} | 175 +++-
 ...5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json} | 175 +++-
 ...8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json} | 175 +++-
 ...f8044c74-3f1c-4562-a21c-e448061b2077.json} | 175 +++-
 ...4abe3a0d-ba04-41f7-b107-59f11ff5697a.json} | 179 +++-
 ...646adb7b-0761-4639-8776-83ea158bfca4.json} | 179 +++-
 ...85cf6be2-d066-4e1b-b373-d53d3c922184.json} | 179 +++-
 ...52db5c6d-b54e-401a-880d-8ab41a394bc4.json} | 175 +++-
 ...68becad6-9455-4d3d-8d68-d1b4448598a1.json} | 175 +++-
 ...519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json} | 175 +++-
 ...972bc5db-f536-42f9-aa51-83cc2f59b76a.json} | 175 +++-
 ...b2220101-56e0-49d9-a3d1-d3bec769ab97.json} | 175 +++-
 ...96907b25-05c3-441b-afc4-69274c20bfc3.json} | 175 +++-
 ...66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json} | 175 +++-
 ...70e9e156-6807-489b-b77a-367236614826.json} | 175 +++-
 ...e90cfb46-1173-4d22-9329-9bf57cdd5241.json} | 175 +++-
 ...baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json} | 175 +++-
 ...7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json} | 175 +++-
 ...ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json} | 175 +++-
 ...26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json} | 175 +++-
 ...ecd21c26-cdc4-43b1-b933-4d970df9413a.json} | 175 +++-
 ...9d4350eb-cdf0-432f-b3b0-45f4832ca950.json} | 175 +++-
 ...3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json} | 179 +++-
 ...b277c87e-54b5-466f-97d7-35db4cd7b985.json} | 175 +++-
 ...270df23b-9e58-4259-a8ed-0d25b9c80b2a.json} | 175 +++-
 ...1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json} | 179 +++-
 ...ef171b67-72a6-46d3-9eaf-4614ff474852.json} | 179 +++-
 ...e6ea5f7e-0533-4a99-8638-1cc10c454238.json} | 175 +++-
 ...83c924fe-6318-4bad-adb0-8a81e5e28ee0.json} | 175 +++-
 ...82e2c0e3-66f2-431f-b4b8-d2495970d998.json} | 175 +++-
 ...6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json} | 175 +++-
 ...e18fbf9e-677c-49fb-ab76-475e8f605f01.json} | 175 +++-
 ...039af363-0c5c-4e36-8396-cd57c7e4c1de.json} | 175 +++-
 ...8ea1facb-260a-461d-9271-2c07b318c46f.json} | 175 +++-
 ...93007ac9-04c2-451d-abd2-2f235297747e.json} | 175 +++-
 ...b04e5f90-e46e-4d7a-a6a9-569bde072208.json} | 175 +++-
 ...933dc76f-45f0-48e0-93ae-3e19cff87c2a.json} | 175 +++-
 ...b8408a64-eb89-4337-8ee5-3c48e4e24437.json} | 175 +++-
 ...d5846321-0800-4ff9-b85c-53c8b4884ba5.json} | 175 +++-
 ...baa5f92c-b626-4e09-a084-61ce7f5dee98.json} | 179 +++-
 ...9b648e90-8d3c-403d-9ad8-382ef0b212a6.json} | 179 +++-
 ...0692f762-337e-4c20-8ad6-feecc93882a3.json} | 179 +++-
 ...a91c9563-0756-4616-8a58-3c8000f73895.json} | 179 +++-
 ...3a329574-dcf6-4177-b37c-c495e6af6cc5.json} | 175 +++-
 ...9e662c1e-e77c-4fb3-b589-127683a4b2ca.json} | 175 +++-
 ...375140f6-bd3f-4b55-a35c-23de37254296.json} | 175 +++-
 ...021d0b25-8f58-47da-a58c-ac532a7972bf.json} | 175 +++-
 ...9207fec4-d0c4-4f66-b917-f5ed57409215.json} | 175 +++-
 ...b04c8845-cccf-4856-9597-ab283bb2ec8d.json} | 175 +++-
 ...4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json} | 175 +++-
 ...0e30e895-aaf7-42d4-95db-7541d6b41c87.json} | 183 ++--
 ...4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json} | 185 ++--
 ...8befd29c-a16d-4e05-a92f-00b621d45e03.json} | 185 ++--
 ...b2e193b8-215b-4e80-9d5a-df11f1dac88a.json} | 185 ++--
 ...eedd0f38-6d26-4297-a469-291227ec6be6.json} | 184 +++-
 ...74c47665-740f-4784-8a27-1c1d1c29bff8.json} | 184 +++-
 ...8027b577-7f48-4df5-9879-bd45ac342f42.json} | 184 +++-
 ...e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json} | 184 +++-
 ...24e11e7b-15d6-4a09-9545-38486d0eb236.json} | 184 +++-
 ...eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json} | 184 +++-
 ...52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json} | 184 +++-
 ...68713712-ae92-474b-84c0-1b8301538439.json} | 184 +++-
 ...15cc9411-6ea4-4f10-831f-23ff27fd5704.json} | 182 +++-
 ...3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json} | 182 +++-
 ...1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json} | 180 +++-
 ...078d812b-2198-4497-8fbe-06fb640fd86d.json} | 184 +++-
 ...f928a53d-9d67-45e7-a871-04359c8162d5.json} | 184 +++-
 ...741c4560-eb35-4edf-a48b-af29e743740a.json} | 184 +++-
 ...4e8a8384-5f1d-4b76-be9b-385407332d6c.json} | 184 +++-
 ...0684c1d2-ea43-4341-820c-09051f5e11f2.json} | 182 +++-
 ...51821ca1-7eac-4094-abac-98b2484cc5a0.json} | 182 +++-
 ...8a0f5749-7f6a-4813-9c08-7283433c1337.json} | 186 ++--
 ...4697983d-a29a-484d-9268-7974117456e8.json} | 184 +++-
 ...60e33aa3-0593-42e6-9baa-8311746deca0.json} | 184 +++-
 ...2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json} | 184 +++-
 ...9ad91ee2-7a64-4f94-9166-f2681777023b.json} | 184 +++-
 ...4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json} | 184 +++-
 ...64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json} | 182 +++-
 ...fe8a36b0-4361-461b-b310-656c54131fa6.json} | 182 +++-
 ...b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json} | 182 +++-
 ...67967a2a-5fb4-46e8-b1ec-eda1588d9086.json} | 182 +++-
 ...0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json} | 182 +++-
 ...ba5eea81-2120-4a20-8322-dfbd29cd197c.json} | 182 +++-
 ...9dd66ede-da5c-4627-92ed-7057c9a2bea3.json} | 182 +++-
 ...801aa7da-90b2-48d1-ad3d-943b06bd437c.json} | 184 +++-
 ...a58923ea-fa22-4c45-8327-efbe84c8a05d.json} | 182 +++-
 ...bab8d241-fad0-4230-b213-c2eeccc79f12.json} | 184 +++-
 ...65e37589-ef26-46cd-a627-798af70e75bf.json} | 184 +++-
 ...f499f9c6-4c9a-43ba-b4c3-d094494a371c.json} | 184 +++-
 ...27a54446-57b2-4239-b768-7ab85dc94c54.json} | 184 +++-
 ...5de8a13e-a029-4a90-9a2d-c28a59212140.json} | 184 +++-
 ...f9643ce2-7347-401b-903e-fadcc5221f36.json} | 186 ++--
 ...9932e430-2039-40b0-bc8f-ae2d833543e8.json} | 184 +++-
 ...dbd2e9bb-c2ca-4165-b229-d736a70721a5.json} | 184 +++-
 ...32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json} | 184 +++-
 ...70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json} | 184 +++-
 ...07a367ee-2879-4ede-bbf8-33b24d682467.json} | 184 +++-
 ...fee914c7-d6bf-4d61-9f50-71bae5f11006.json} | 184 +++-
 ...b0577066-231e-461b-bae8-b724b204397a.json} | 184 +++-
 ...b79fe2e3-5eec-46f8-90a1-810781c8c46a.json} | 184 +++-
 ...998616ef-5d1b-4c65-b6ad-23afc3630d5a.json} | 184 +++-
 ...fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json} | 184 +++-
 ...25fde5e6-86b8-4a80-8f79-5946ef9999fc.json} | 184 +++-
 ...b955825d-ae7f-48c4-9dad-5ee78879737d.json} | 184 +++-
 ...168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json} | 184 +++-
 ...0807e353-9787-4ca0-8f7b-50d1bed2469e.json} | 184 +++-
 ...0164b885-2c27-4eba-8e6f-e69156cb0dee.json} | 184 +++-
 ...08422837-51a0-45c9-9004-fc5d98dce462.json} | 184 +++-
 ...39f2c7f2-56d4-4349-95ae-374d34263f48.json} | 184 +++-
 ...0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json} | 182 +++-
 ...75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json} | 184 +++-
 ...2de4b89a-3f3b-4d1d-ba85-030953a46956.json} | 184 +++-
 ...bd68405f-fe9a-448b-9c80-468c656594e5.json} | 184 +++-
 ...4267fef1-3180-46e3-990e-0d1092ec4c18.json} | 184 +++-
 ...002a34dc-39e5-451d-b2a8-b51bdb69a056.json} | 184 +++-
 ...5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json} | 184 +++-
 ...ad2beded-cec3-4b47-b8de-a32a3225fa66.json} | 184 +++-
 ...eb901347-fc1f-4d8f-a70a-05a83e16658d.json} | 184 +++-
 ...9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json} | 184 +++-
 ...042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json} | 184 +++-
 ...d2d48e4a-0484-4f44-8108-2e689d7ca695.json} | 184 +++-
 ...e54ae605-a91d-47d7-a08d-67bd0ea5c606.json} | 184 +++-
 ...15dccf75-871d-457b-8495-e0d03d550360.json} | 184 +++-
 ...18fe5d30-bf36-405a-819e-1ecabda327ea.json} | 184 +++-
 ...cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json} | 184 +++-
 ...cd199905-04a4-4745-b848-4f7bde97ca17.json} | 184 +++-
 ...1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json} | 184 +++-
 ...bfd70aff-bf45-4f55-b730-4924afc181cd.json} | 184 +++-
 ...b6e08679-1bd7-42a1-9eee-98252de2c7c1.json} | 184 +++-
 ...22b411d5-a314-4b17-a9c7-c1af7ca7df33.json} | 184 +++-
 ...f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json} | 184 +++-
 ...fb1bb023-16f6-4914-889b-6458d7ab1277.json} | 184 +++-
 ...8b572c10-3553-4e51-a321-bdb05996914b.json} | 184 +++-
 ...6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json} | 184 +++-
 ...e0efe169-d28e-418e-a78c-9b04ec29aae2.json} | 184 +++-
 ...05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json} | 184 +++-
 ...983696ae-d7f3-48a4-b7a0-a42487728182.json} | 186 ++--
 ...a969e516-adef-4839-9252-244c58ab3c67.json} | 186 ++--
 ...f122f9de-b1ce-40ea-8731-6c00c7af0498.json} | 182 +++-
 ...5c7982c5-3513-4ff2-9857-33a0db825376.json} | 184 +++-
 ...4910859a-750c-4728-bf30-309e0e81690e.json} | 184 +++-
 ...32f0532f-b504-492d-84d7-f541930edad0.json} | 182 +++-
 ...04c187a3-4532-4523-b39d-19314d61c779.json} | 190 ++--
 ...4440532c-9b49-4c9a-8bf4-f122531c54fa.json} | 184 +++-
 ...bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json} | 184 +++-
 ...3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json} | 850 ++++++++++++------
 ...6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json} | 850 ++++++++++++------
 ...3d0b3d68-a853-4989-a35e-83ac6722c2da.json} | 850 ++++++++++++------
 ...ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json} | 848 +++++++++++------
 ...517e8027-6edd-482b-86f3-33b6c41a9609.json} | 848 +++++++++++------
 ...f7c1c125-ad0f-4847-b880-4f705f1666c6.json} | 848 +++++++++++------
 ...5a0ba280-8a12-4735-9d92-4ed71ba395b4.json} | 850 ++++++++++++------
 ...73ccc6a6-e10d-4619-914f-26032cddf8da.json} | 850 ++++++++++++------
 ...20c5af59-ff73-4731-9230-f92bb86e657b.json} | 848 +++++++++++------
 ...fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json} | 848 +++++++++++------
 ...d30617fc-8d64-4070-b86a-c982025cfcea.json} | 848 +++++++++++------
 ...aa8cae95-cb75-4241-951c-25e2046042dd.json} | 848 +++++++++++------
 ...c88e4a03-22ae-4338-bf5f-36070814136a.json} | 850 ++++++++++++------
 ...4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json} | 850 ++++++++++++------
 ...ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json} | 850 ++++++++++++------
 ...097a8da1-f411-4359-8440-2ab06f4ae76c.json} | 850 ++++++++++++------
 ...68130abd-1df5-4cd3-919a-2863e9f013c7.json} | 850 ++++++++++++------
 ...5d8d795a-d213-4b96-9b17-ad5fae6b3687.json} | 850 ++++++++++++------
 ...7908da03-f030-4c62-a121-c04bd94ea75e.json} | 848 +++++++++++------
 ...c6fdbf96-2500-4410-8fcd-268ea3e16062.json} | 848 +++++++++++------
 ...537164c3-7b88-4543-b19d-370f55a25a66.json} | 848 +++++++++++------
 ...0c539e26-8403-42db-acfc-7953dd80ae20.json} | 848 +++++++++++------
 ...364c7490-8bb1-4e7e-b485-fb3c2224da58.json} | 850 ++++++++++++------
 ...1a9167d2-882c-4582-b4e0-ac425896a317.json} | 848 +++++++++++------
 ...8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json} | 850 ++++++++++++------
 ...d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json} | 850 ++++++++++++------
 ...a94c9e13-dca7-4e02-a795-09d9274354d3.json} | 850 ++++++++++++------
 ...75c8b20f-a4d4-4699-be79-f027bf7f0d69.json} | 850 ++++++++++++------
 ...264be7b4-08b7-40b6-a5e7-f3536f361450.json} | 850 ++++++++++++------
 ...83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json} | 850 ++++++++++++------
 ...8a013eb3-0f21-4a50-8a53-4ba977951130.json} | 850 ++++++++++++------
 ...7b081a40-7cb6-4405-b842-3db95f290dfa.json} | 850 ++++++++++++------
 ...54185b53-9891-43c6-8f93-09ff02b728d8.json} | 850 ++++++++++++------
 ...884c194d-6519-4bd4-8add-6514e593c514.json} | 850 ++++++++++++------
 ...a80cbd76-bcf8-4174-b0b3-346fae152bdb.json} | 850 ++++++++++++------
 ...5f105986-aa7d-4858-91bc-cece9d0085ba.json} | 850 ++++++++++++------
 ...528b7b4e-c8a6-4387-bd98-497a3316029d.json} | 850 ++++++++++++------
 ...96eb34db-66bd-4945-8b4c-a8c1394fe56a.json} | 850 ++++++++++++------
 ...961e917b-0e67-462c-b9d0-0fe4b4b85beb.json} | 850 ++++++++++++------
 ...59a85d2c-16ce-4ed4-bc65-f6898127fa57.json} | 850 ++++++++++++------
 ...16a8b446-51fc-4c23-9231-46ee16c1c0a8.json} | 850 ++++++++++++------
 ...f4de7e58-7060-440b-8f6f-1f79d7499d1e.json} | 850 ++++++++++++------
 ...5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json} | 850 ++++++++++++------
 ...dc6aa933-67e4-4811-b3e2-e5200c002abe.json} | 850 ++++++++++++------
 ...5f9758a3-fd6d-4598-930a-9c01420d05e8.json} | 850 ++++++++++++------
 ...7592c0d8-a06c-4189-81a1-dbf794d22c8b.json} | 850 ++++++++++++------
 ...83c0e8e3-087c-4d61-9153-e571b4971871.json} | 850 ++++++++++++------
 ...c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json} | 850 ++++++++++++------
 ...5baac093-babb-41cd-a2f4-985d0b91be37.json} | 848 +++++++++++------
 ...1bf54088-ba12-45b4-8f80-63d5c38f58f6.json} | 850 ++++++++++++------
 ...5ed0a970-200f-4f23-9623-e714afa49ddf.json} | 850 ++++++++++++------
 ...e7fd06a6-65e5-4f88-8e86-c513f78e31db.json} | 850 ++++++++++++------
 ...ac047aef-008f-4c87-a6d5-4f331ebf5c53.json} | 850 ++++++++++++------
 ...ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json} | 850 ++++++++++++------
 ...7517b6c9-c613-416c-aadb-39fd6d252da7.json} | 850 ++++++++++++------
 ...85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json} | 850 ++++++++++++------
 ...df568c3c-8a5c-4455-836d-c980d7f5ea5c.json} | 850 ++++++++++++------
 ...96e24977-ca6d-402c-bfd8-62be4cd9b902.json} | 850 ++++++++++++------
 ...e5b2636a-8438-40c0-9f89-9f35585bf740.json} | 850 ++++++++++++------
 ...f3259d92-3c95-4b78-81ae-f7f4b80aec63.json} | 850 ++++++++++++------
 ...5ba23a34-4232-487f-b3e9-326d776135be.json} | 850 ++++++++++++------
 ...5bc1a462-f753-4259-91c3-a549491b2986.json} | 850 ++++++++++++------
 ...16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json} | 850 ++++++++++++------
 ...dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json} | 850 ++++++++++++------
 ...2ca11d4c-52e6-49ea-a5cb-238c0313c483.json} | 850 ++++++++++++------
 ...de400624-6c2e-47af-b851-54c4075c30ee.json} | 850 ++++++++++++------
 ...34441b3b-4d66-444c-af85-ca0666a48ed4.json} | 850 ++++++++++++------
 ...eecf5e40-9110-47ea-a72b-9ba587b96e30.json} | 850 ++++++++++++------
 ...f26fb123-c214-4d18-aea8-b05b4ea1819b.json} | 850 ++++++++++++------
 ...30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json} | 850 ++++++++++++------
 ...b152cd5c-cbc0-48f4-ba37-16878c3afba1.json} | 850 ++++++++++++------
 ...dac223e9-3073-46f9-924b-c5a6408f5da9.json} | 850 ++++++++++++------
 ...a7a218ff-7afe-417c-ac39-cf305d592d56.json} | 850 ++++++++++++------
 ...2e165735-43b8-4317-9cde-35aa4b5bcb26.json} | 850 ++++++++++++------
 ...15c25bc5-7b1e-4771-bda2-fd04d74e1463.json} | 850 ++++++++++++------
 ...26036c7c-e981-46e8-b5e9-dcd7d116af70.json} | 848 +++++++++++------
 ...b3269e4e-98a7-4795-8ef3-fc87774a54b7.json} | 848 +++++++++++------
 ...284fde9f-8570-4e6d-9190-e52d8723fe57.json} | 848 +++++++++++------
 ...fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json} | 850 ++++++++++++------
 scripts/HELM/parse_helm_leaderboards.sh       |   9 +
 utils/helm/adapter.py                         |  56 +-
 311 files changed, 75832 insertions(+), 28221 deletions(-)
 rename data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/{18881f8b-b06e-4317-b697-6eadb975077c.json => bd982107-7c03-4ee8-8a38-782d68883818.json} (80%)
 rename data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/{97db1a8d-b7d8-4481-82fb-dc0c6396edac.json => 25aa6e41-ab16-4f63-9613-bfb83b9151c5.json} (80%)
 rename data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/{8d29f447-01d8-4fae-87d5-b4386ce5239a.json => ddd52881-1248-4652-9f1d-5d2b58ede889.json} (80%)
 rename data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/{53090373-ea82-4b63-83fd-f1d48f0400cd.json => 365bc693-73b6-41fe-a8fa-eba7b91febe0.json} (80%)
 rename data/helm_capabilities/amazon/{nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json => nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json} (80%)
 rename data/helm_capabilities/amazon/{nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json => nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json} (80%)
 rename data/helm_capabilities/amazon/{nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json => nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json} (81%)
 rename data/helm_capabilities/amazon/{nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json => nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json} (80%)
 rename data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/{568969ac-4b9a-42b0-8374-2b28dde30a3c.json => f350d9d1-b743-4017-bc68-a4dc726515d0.json} (80%)
 rename data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/{c6b92f00-6335-463d-87db-817ff85f36c8.json => c32a1f0a-bf8a-42be-b155-4f87465235bc.json} (80%)
 rename data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/{460fdbd2-a164-4af4-95ff-db66e381ca0c.json => 96cfde1b-77de-4d2a-8b45-938116795108.json} (80%)
 create mode 100644 data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json
 rename data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/{cb21169b-04ff-47d1-92dd-5b5f2e09b863.json => d633fcd6-eb01-49ff-ba7c-6ca12734746f.json} (80%)
 rename data/helm_capabilities/anthropic/claude-opus-4-20250514/{2168d830-ad6b-4aee-94f0-7ec8fd403a49.json => 7a7b49ff-5060-4d12-acb9-607125fbe081.json} (80%)
 rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/{a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json => 287a3646-d969-4bd9-9773-86463c1ba87f.json} (80%)
 rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514/{629d5de7-25ed-4088-aca6-7fb53719f4a4.json => 97f3892f-9588-49ef-abef-3a0c965bb352.json} (80%)
 create mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json
 rename data/helm_capabilities/deepseek-ai/deepseek-r1-0528/{fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json => 9e5684dc-6380-4353-b966-7205d66340fa.json} (81%)
 rename data/helm_capabilities/deepseek-ai/deepseek-v3/{d031935b-2b54-4940-a852-dad1f10fc396.json => 1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json} (81%)
 rename data/helm_capabilities/google/gemini-1.5-flash-002/{b79010aa-d441-4850-b656-52ce6587dab8.json => 20512a3b-ac0f-483a-8bec-9962980c579c.json} (80%)
 rename data/helm_capabilities/google/gemini-1.5-pro-002/{dde5a36d-f14b-482d-86db-74bdb3771e38.json => 704c5c74-a0ee-457d-9b4e-3ae895ffc105.json} (80%)
 rename data/helm_capabilities/google/gemini-2.0-flash-001/{981ba423-a1d2-4577-9f61-9c4b8b430b58.json => eb9224b8-0edb-4605-a2ee-cfb63f41370e.json} (81%)
 rename data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/{56ddcce9-fc1c-476f-96c8-65a7d732c95b.json => 4cb58f80-c2b1-45c6-b781-19af47660eb0.json} (80%)
 rename data/helm_capabilities/google/gemini-2.5-flash-lite/{22da4909-8b3b-49f3-940f-8764509725f8.json => 6307e0c4-c983-4257-82d8-b2a50171eb8a.json} (81%)
 rename data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/{a6b3d596-d204-4cb7-a3e4-4e717537b76a.json => 275cd615-bddf-4afe-a499-b463fe183486.json} (80%)
 rename data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/{eaa18be0-1195-4344-9673-efa8c555456d.json => 03b48360-a387-44ba-94b2-2eb7c234a9fa.json} (80%)
 create mode 100644 data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json
 rename data/helm_capabilities/ibm/granite-3.3-8b-instruct/{0ae30d3c-098c-434f-985b-58e8179148a6.json => 5e5720d0-67fe-40a9-b65b-d4154848d83c.json} (81%)
 create mode 100644 data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json
 create mode 100644 data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json
 rename data/helm_capabilities/marin-community/marin-8b-instruct/{cc90bae5-b964-4402-9edb-5427663f01fb.json => aba1fded-b031-48df-87ef-dc744df33501.json} (80%)
 rename data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/{2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json => 98f69aa6-b227-4076-a76e-1293cbe1c6cb.json} (80%)
 rename data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/{9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json => d2bb087e-a275-4fce-b6dc-001fd4545883.json} (80%)
 rename data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/{930db2c4-d9c5-4e38-ae80-7304c2f10611.json => 84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json} (80%)
 rename data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/{226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json => 23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json} (80%)
 rename data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/{bb4e408d-505e-46c8-bd0c-7afa44a96498.json => 9cab3a77-4f32-48d0-ba11-e2323ccc4861.json} (80%)
 rename data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/{d63dad7a-f7b7-4c87-9712-3043fc117545.json => 9e037c92-1253-49be-b31a-3aa017531d77.json} (80%)
 rename data/helm_capabilities/mistralai/mistral-large-2411/{7e7f739e-9363-4c41-871d-6cf6c4145728.json => bd26c7cb-ce76-4b17-b617-d1d93a168c93.json} (81%)
 rename data/helm_capabilities/mistralai/mistral-small-2503/{853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json => 9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json} (80%)
 rename data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/{b05befca-44a5-45fb-823e-84bcc3ae81d0.json => d69a1cbe-353c-4be9-b93b-5224d24c7adf.json} (80%)
 rename data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/{2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json => 915cb39d-f21f-4ef1-a95f-f44f79ede893.json} (80%)
 rename data/helm_capabilities/moonshotai/kimi-k2-instruct/{eaeab0d7-4418-4699-9774-bc1c6711b3d3.json => fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json} (81%)
 rename data/helm_capabilities/openai/gpt-4.1-2025-04-14/{c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json => eb51f418-6abf-4b2c-9f57-0b830c00bd15.json} (81%)
 rename data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/{acaf03fd-9d4b-4fe3-8ffe-88212a786363.json => 41cd14b0-46ba-49da-844a-19fe866bef1e.json} (80%)
 rename data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/{308d3e1d-a1b9-4722-8333-23b840316e3d.json => 7de93642-a4bc-430b-8733-9befeb6a0e23.json} (80%)
 rename data/helm_capabilities/openai/gpt-4o-2024-11-20/{84a942b6-2b77-4bc2-859f-6b8d6be93558.json => 4f18292a-1fef-4feb-9b17-045c96e3e137.json} (81%)
 rename data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/{7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json => 7458c032-b24d-4f13-a659-b6e19d19a8e1.json} (80%)
 rename data/helm_capabilities/openai/gpt-5-2025-08-07/{cb444c37-e273-4aaf-881e-8a433f630053.json => 21eb1648-aad0-4297-9edc-c445e4c38694.json} (81%)
 rename data/helm_capabilities/openai/gpt-5-mini-2025-08-07/{7af059e2-b56e-46ed-b699-63e570081f16.json => 99d657ae-e850-4caf-a599-13f1b8072273.json} (81%)
 rename data/helm_capabilities/openai/gpt-5-nano-2025-08-07/{2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json => 10cd766e-442c-4b3d-833b-740417d9a6d9.json} (80%)
 create mode 100644 data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json
 rename data/helm_capabilities/openai/gpt-oss-120b/{e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json => 06719cd4-5654-49b6-9dee-e112d1601d1c.json} (80%)
 rename data/helm_capabilities/openai/gpt-oss-20b/{acb07214-c0f3-4006-8a3b-23793891a1bf.json => ed849999-48c2-4569-8bcd-dc73084e3197.json} (80%)
 rename data/helm_capabilities/openai/o3-2025-04-16/{a1c5d581-be98-4e1e-ba14-ca922bfac035.json => 01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json} (80%)
 rename data/helm_capabilities/openai/o4-mini-2025-04-16/{c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json => 32382d69-21c7-43a9-bb95-27607ec18cc9.json} (80%)
 rename data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/{f6d74c93-0e96-4fc5-987c-18a79dbde17c.json => 77e702f7-37ef-4487-b047-74b13ef6d966.json} (80%)
 rename data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/{f96da103-5350-4b1b-b33e-6ced1f1f7815.json => 4ee3c647-740c-41a6-ac66-4a38b09317ff.json} (80%)
 rename data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/{27bae7f2-92dd-4feb-9050-2d11c6da2d61.json => ca30726a-00a6-4228-94fe-5dce00de1d5e.json} (81%)
 rename data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/{0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json => 7862890a-298b-4bda-b8f1-7be6a5779365.json} (81%)
 create mode 100644 data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json
 rename data/helm_capabilities/writer/palmyra-fin/{39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json => 442aed0d-95c3-4436-ad63-b7b1e93307f4.json} (80%)
 rename data/helm_capabilities/writer/palmyra-med/{1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json => 7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json} (80%)
 rename data/helm_capabilities/writer/palmyra-x-004/{01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json => bc2c91e0-6afd-4e44-b665-d5c7558f8981.json} (80%)
 rename data/helm_capabilities/writer/palmyra-x5/{c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json => a74b74f7-ccce-4341-a122-26728cc6bece.json} (80%)
 rename data/helm_capabilities/xai/grok-3-beta/{24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json => 87811b75-afe8-413b-949d-7fd1f582a2e8.json} (80%)
 rename data/helm_capabilities/xai/grok-3-mini-beta/{b028eaaf-bc4d-4918-8464-f8c4b0c74973.json => ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json} (80%)
 rename data/helm_capabilities/xai/grok-4-0709/{c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json => 924080a0-c530-4e6d-b1a4-107de3bd7183.json} (80%)
 rename data/helm_capabilities/zai-org/glm-4.5-air-fp8/{7b231b0d-89b8-4a0a-825e-ccfea212f565.json => be23c720-a99a-4945-bc0b-ddc27c8eec39.json} (81%)
 rename data/helm_classic/{anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json => Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json} (92%)
 rename data/helm_classic/ai21/J1-Grande-v1-17B/{09f5c502-2950-48fb-b25f-b562eeee26c8.json => c12a8494-bafc-4097-874a-7c00636e96f8.json} (92%)
 rename data/helm_classic/ai21/J1-Grande-v2-beta-17B/{3d13f9ba-b18e-4b52-b28d-9aed0621903d.json => 4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json} (92%)
 rename data/helm_classic/ai21/J1-Jumbo-v1-178B/{3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json => 19f61327-fcc3-408f-9254-2d6a2aadcd4e.json} (92%)
 rename data/helm_classic/ai21/J1-Large-v1-7.5B/{1ab7f23a-7527-4188-9141-852f5123eb19.json => ccc17d56-bd26-409c-ac3f-262eaba9ce21.json} (92%)
 rename data/helm_classic/ai21/Jurassic-2-Grande-17B/{f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json => f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json} (92%)
 rename data/helm_classic/ai21/Jurassic-2-Jumbo-178B/{ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json => 9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json} (92%)
 rename data/helm_classic/ai21/Jurassic-2-Large-7.5B/{67114722-a441-478b-a324-2c32be7e06a7.json => f25c142c-8730-4241-a649-01d076e1f28d.json} (91%)
 rename data/helm_classic/aleph-alpha/Luminous-Base-13B/{07fa437f-398d-48ab-a74d-b8c59caf3add.json => ab34f23e-36db-40c0-9681-f30b00692f98.json} (92%)
 rename data/helm_classic/aleph-alpha/Luminous-Extended-30B/{7492964a-2c16-4261-aaca-dbcd4f3be7c3.json => 67281534-a03d-49d8-a586-25cb1a03134e.json} (92%)
 rename data/helm_classic/aleph-alpha/Luminous-Supreme-70B/{b5dace02-416d-4b90-90e1-562b22820784.json => 3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json} (92%)
 rename data/helm_classic/bigscience/BLOOM-176B/{0e6cd483-dff8-4fba-9239-82cb0fe34d42.json => 04ce2ba4-c382-4658-ba06-1def9499a243.json} (92%)
 rename data/helm_classic/bigscience/T0pp-11B/{9ae59291-604f-4527-812a-a3150a1098f2.json => 3a546396-d031-4958-8410-00e0d3406089.json} (93%)
 rename data/helm_classic/cohere/Cohere-Command-beta-52.4B/{52026df3-2452-4fd2-a10b-73a2bfc5397e.json => e7b99aa6-08e8-4224-a805-16586eb44325.json} (92%)
 rename data/helm_classic/cohere/Cohere-Command-beta-6.1B/{19b97859-5af3-4883-a878-93d026c29d87.json => 43a3fe19-929a-463d-a0ed-791dad765188.json} (92%)
 rename data/helm_classic/cohere/Cohere-large-v20220720-13.1B/{37af5185-3599-49f5-9637-55d41bc6ae81.json => 75468958-b75b-41fe-9813-070b793e86d9.json} (92%)
 rename data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/{cf32b49f-7cf8-43a3-9e28-ade7446272ab.json => 6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json} (92%)
 rename data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/{ad9bd354-01d9-4a21-a299-a53190e1eb7e.json => 3c9c425a-ce4a-4958-9744-7f9490ed5729.json} (92%)
 rename data/helm_classic/cohere/Cohere-small-v20220720-410M/{12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json => 5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json} (92%)
 rename data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/{ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json => 8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json} (92%)
 rename data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/{d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json => f8044c74-3f1c-4562-a21c-e448061b2077.json} (92%)
 rename data/helm_classic/{eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json => eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json} (91%)
 rename data/helm_classic/{eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json => eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json} (91%)
 rename data/helm_classic/{writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json => google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json} (91%)
 rename data/helm_classic/google/T5-11B/{df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json => 52db5c6d-b54e-401a-880d-8ab41a394bc4.json} (92%)
 rename data/helm_classic/google/UL2-20B/{ac49ac68-0d7f-4972-bb99-0332b14df2d5.json => 68becad6-9455-4d3d-8d68-d1b4448598a1.json} (92%)
 rename data/helm_classic/lmsys/Vicuna-v1.3-13B/{39f4648c-6635-4ffa-86f5-040e69f3e054.json => 519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json} (91%)
 rename data/helm_classic/lmsys/Vicuna-v1.3-7B/{4ef38a9d-283c-4549-8de3-d04ce7f62542.json => 972bc5db-f536-42f9-aa51-83cc2f59b76a.json} (91%)
 rename data/helm_classic/meta/LLaMA-13B/{81eee874-47be-4a55-af47-5b3e1bcbd361.json => b2220101-56e0-49d9-a3d1-d3bec769ab97.json} (91%)
 rename data/helm_classic/meta/LLaMA-30B/{2a23b568-daed-4783-9c51-5218216f5f19.json => 96907b25-05c3-441b-afc4-69274c20bfc3.json} (91%)
 rename data/helm_classic/meta/LLaMA-65B/{584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json => 66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json} (91%)
 rename data/helm_classic/meta/LLaMA-7B/{6a2445e0-75d4-4434-aabd-645fd445a920.json => 70e9e156-6807-489b-b77a-367236614826.json} (91%)
 rename data/helm_classic/meta/Llama-2-13B/{f5d57067-8a00-490f-b1bf-30afd0b0f126.json => e90cfb46-1173-4d22-9329-9bf57cdd5241.json} (91%)
 rename data/helm_classic/meta/Llama-2-70B/{cb8802af-613e-42a1-b025-31532996eb10.json => baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json} (91%)
 rename data/helm_classic/meta/Llama-2-7B/{ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json => 7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json} (91%)
 rename data/helm_classic/meta/OPT-175B/{75a5843f-73a4-4ff3-94b5-184152ff703c.json => ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json} (92%)
 rename data/helm_classic/meta/OPT-66B/{83d19197-aebd-43fa-a7ed-20818a9e5d8e.json => 26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json} (92%)
 rename data/helm_classic/microsoft/TNLG-v2-530B/{dd121d07-5198-4ac6-81d6-df38485bff25.json => ecd21c26-cdc4-43b1-b933-4d970df9413a.json} (92%)
 rename data/helm_classic/microsoft/TNLG-v2-6.7B/{f23680f4-8b5a-4baf-9e8d-74f0f4847183.json => 9d4350eb-cdf0-432f-b3b0-45f4832ca950.json} (92%)
 rename data/helm_classic/{mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json => mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json} (91%)
 rename data/helm_classic/mosaicml/MPT-30B/{cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json => b277c87e-54b5-466f-97d7-35db4cd7b985.json} (91%)
 rename data/helm_classic/mosaicml/MPT-Instruct-30B/{182a7373-7ea3-4f2b-b730-af16e20b9fa7.json => 270df23b-9e58-4259-a8ed-0d25b9c80b2a.json} (91%)
 rename data/helm_classic/{eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json => openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json} (92%)
 rename data/helm_classic/{eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json => openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json} (92%)
 rename data/helm_classic/openai/ada-350M/{f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json => e6ea5f7e-0533-4a99-8638-1cc10c454238.json} (94%)
 rename data/helm_classic/openai/babbage-1.3B/{1c4a54f3-4599-441b-8f30-5e275a0597a7.json => 83c924fe-6318-4bad-adb0-8a81e5e28ee0.json} (94%)
 rename data/helm_classic/openai/curie-6.7B/{dbefbdbd-b64e-40e9-b632-0dcae3f33913.json => 82e2c0e3-66f2-431f-b4b8-d2495970d998.json} (94%)
 rename data/helm_classic/openai/davinci-175B/{f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json => 6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json} (94%)
 rename data/helm_classic/openai/gpt-3.5-turbo-0301/{2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json => e18fbf9e-677c-49fb-ab76-475e8f605f01.json} (91%)
 rename data/helm_classic/openai/gpt-3.5-turbo-0613/{826d8e72-7332-48b1-af41-537e505c9e11.json => 039af363-0c5c-4e36-8396-cd57c7e4c1de.json} (91%)
 rename data/helm_classic/openai/text-ada-001/{c34ec087-f3a1-49f1-8ff7-79f353171c4c.json => 8ea1facb-260a-461d-9271-2c07b318c46f.json} (94%)
 rename data/helm_classic/openai/text-babbage-001/{09763c40-c365-4be9-befc-970ce1886641.json => 93007ac9-04c2-451d-abd2-2f235297747e.json} (94%)
 rename data/helm_classic/openai/text-curie-001/{4ece7c38-114a-4973-ba13-ac3821c9836f.json => b04e5f90-e46e-4d7a-a6a9-569bde072208.json} (94%)
 rename data/helm_classic/openai/text-davinci-002/{75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json => 933dc76f-45f0-48e0-93ae-3e19cff87c2a.json} (94%)
 rename data/helm_classic/openai/text-davinci-003/{0c43aeaf-c7d3-4e00-8b84-5115a6396585.json => b8408a64-eb89-4337-8ee5-3c48e4e24437.json} (94%)
 rename data/helm_classic/stanford/Alpaca-7B/{d25691b8-37e7-42ff-b59a-8684197280f1.json => d5846321-0800-4ff9-b85c-53c8b4884ba5.json} (91%)
 rename data/helm_classic/{tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json => tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json} (91%)
 rename data/helm_classic/{tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json => tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json} (91%)
 rename data/helm_classic/{tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json => tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json} (91%)
 rename data/helm_classic/{tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json => tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Base-7B/{8db87a70-babc-4776-8317-70752d3c5546.json => 3a329574-dcf6-4177-b37c-c495e6af6cc5.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/{3da308fb-2403-432e-bde3-3b14af627552.json => 9e662c1e-e77c-4fb3-b589-127683a4b2ca.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Instruct-7B/{fd8f7b08-813c-4369-bfe4-d86eacc874ea.json => 375140f6-bd3f-4b55-a35c-23de37254296.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/{e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json => 021d0b25-8f58-47da-a58c-ac532a7972bf.json} (91%)
 rename data/helm_classic/writer/InstructPalmyra-30B/{bcf54365-b229-4abf-8ff8-59b4b46fa829.json => 9207fec4-d0c4-4f66-b917-f5ed57409215.json} (91%)
 rename data/helm_classic/yandex/YaLM-100B/{eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json => b04c8845-cccf-4856-9597-ab283bb2ec8d.json} (91%)
 rename data/helm_classic/zhipu-ai/GLM-130B/{f45719e5-3334-4e1d-8a83-f5f8292cb977.json => 4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json} (91%)
 rename data/helm_instruct/anthropic/claude-v1.3/{c4e55239-581b-433f-82bc-68a690f59e4a.json => 0e30e895-aaf7-42d4-95db-7541d6b41c87.json} (61%)
 rename data/helm_instruct/cohere/command-xlarge-beta/{8a68cccf-2965-4867-b922-460cc5b695de.json => 4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json} (61%)
 rename data/helm_instruct/openai/gpt-3.5-turbo-0613/{a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json => 8befd29c-a16d-4e05-a92f-00b621d45e03.json} (61%)
 rename data/helm_instruct/openai/gpt-4-0314/{d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json => b2e193b8-215b-4e80-9d5a-df11f1dac88a.json} (61%)
 rename data/helm_lite/01-ai/yi-34b/{3b8567cf-40f0-4d63-ad12-9b1712a2c503.json => eedd0f38-6d26-4297-a469-291227ec6be6.json} (82%)
 rename data/helm_lite/01-ai/yi-6b/{3b94c757-b54d-462c-a2a1-d331711a0833.json => 74c47665-740f-4784-8a27-1c1d1c29bff8.json} (82%)
 rename data/helm_lite/01-ai/yi-large-preview/{3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json => 8027b577-7f48-4df5-9879-bd45ac342f42.json} (82%)
 rename data/helm_lite/AlephAlpha/luminous-base/{b4fa23d2-48cd-4a58-b70d-25b466781008.json => e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json} (82%)
 rename data/helm_lite/AlephAlpha/luminous-extended/{818cfaa1-815b-4a13-b017-5e6c30ed9de3.json => 24e11e7b-15d6-4a09-9545-38486d0eb236.json} (82%)
 rename data/helm_lite/AlephAlpha/luminous-supreme/{62727554-ab2c-4218-9c3c-3eba48420834.json => eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json} (82%)
 rename data/helm_lite/ai21/j2-grande/{c58c4299-ede8-46b6-8d33-2f900c272853.json => 52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json} (82%)
 rename data/helm_lite/ai21/j2-jumbo/{bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json => 68713712-ae92-474b-84c0-1b8301538439.json} (82%)
 rename data/helm_lite/ai21/jamba-1.5-large/{38918b97-2707-4b53-99a8-7a67816f398c.json => 15cc9411-6ea4-4f10-831f-23ff27fd5704.json} (82%)
 rename data/helm_lite/ai21/jamba-1.5-mini/{82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json => 3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json} (82%)
 rename data/helm_lite/ai21/jamba-instruct/{9278a23a-cecd-446c-b234-2301e1e44c40.json => 1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json} (82%)
 rename data/helm_lite/allenai/olmo-7b/{81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json => 078d812b-2198-4497-8fbe-06fb640fd86d.json} (82%)
 rename data/helm_lite/amazon/{nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json => nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json} (82%)
 rename data/helm_lite/amazon/{nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json => nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json} (82%)
 rename data/helm_lite/amazon/{nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json => nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json} (82%)
 rename data/helm_lite/anthropic/claude-2.0/{b2b9e87c-76de-4716-8d28-4b13a34c360f.json => 0684c1d2-ea43-4341-820c-09051f5e11f2.json} (82%)
 rename data/helm_lite/anthropic/claude-2.1/{0bd11df6-a037-4f55-a78a-cc23c34c0958.json => 51821ca1-7eac-4094-abac-98b2484cc5a0.json} (82%)
 rename data/helm_lite/anthropic/claude-3-5-haiku-20241022/{f4061c6a-f82f-4642-a734-f6adb0be7519.json => 8a0f5749-7f6a-4813-9c08-7283433c1337.json} (82%)
 rename data/helm_lite/anthropic/claude-3-5-sonnet-20240620/{18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json => 4697983d-a29a-484d-9268-7974117456e8.json} (82%)
 rename data/helm_lite/anthropic/claude-3-5-sonnet-20241022/{d0cd5626-5b2c-46df-b265-e130a789a0e7.json => 60e33aa3-0593-42e6-9baa-8311746deca0.json} (82%)
 rename data/helm_lite/anthropic/claude-3-haiku-20240307/{3eea5b0f-1126-448f-94e5-52a874baa61a.json => 2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json} (82%)
 rename data/helm_lite/anthropic/claude-3-opus-20240229/{9fa44303-4699-47f2-9777-0c118e36d87e.json => 9ad91ee2-7a64-4f94-9166-f2681777023b.json} (82%)
 rename data/helm_lite/anthropic/claude-3-sonnet-20240229/{a2d019d6-52bf-439f-90f0-74583928e5c0.json => 4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json} (82%)
 rename data/helm_lite/anthropic/claude-instant-1.2/{0f884c98-ea5e-4409-81e2-40aa5c84f99d.json => 64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json} (82%)
 rename data/helm_lite/anthropic/claude-v1.3/{2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json => fe8a36b0-4361-461b-b310-656c54131fa6.json} (82%)
 rename data/helm_lite/cohere/command-light/{8c312031-5da7-4816-8207-056fe1bc161d.json => b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json} (82%)
 rename data/helm_lite/cohere/command-r-plus/{71c0558f-7b56-40ea-a1be-2749b88758c7.json => 67967a2a-5fb4-46e8-b1ec-eda1588d9086.json} (82%)
 rename data/helm_lite/cohere/command-r/{d1330068-2c16-450e-8ce5-1d05f5e842d9.json => 0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json} (82%)
 rename data/helm_lite/cohere/command/{dec04718-1ae9-4e4b-92da-01d789424f69.json => ba5eea81-2120-4a20-8322-dfbd29cd197c.json} (82%)
 rename data/helm_lite/databricks/dbrx-instruct/{ba50499a-6cfd-4f04-aab5-c2122202cc74.json => 9dd66ede-da5c-4627-92ed-7057c9a2bea3.json} (82%)
 rename data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/{35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json => 801aa7da-90b2-48d1-ad3d-943b06bd437c.json} (82%)
 rename data/helm_lite/deepseek-ai/deepseek-v3/{d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json => a58923ea-fa22-4c45-8327-efbe84c8a05d.json} (82%)
 rename data/helm_lite/google/gemini-1.0-pro-002/{1e98157d-49e6-4d66-ae21-a95d419c47e3.json => bab8d241-fad0-4230-b213-c2eeccc79f12.json} (82%)
 rename data/helm_lite/google/gemini-1.5-flash-001/{e92bce18-690a-44eb-8bc5-28e9303473bb.json => 65e37589-ef26-46cd-a627-798af70e75bf.json} (82%)
 rename data/helm_lite/google/gemini-1.5-flash-002/{3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json => f499f9c6-4c9a-43ba-b4c3-d094494a371c.json} (82%)
 rename data/helm_lite/google/gemini-1.5-pro-001/{b1ecfc78-f59e-437f-b163-9253ad092799.json => 27a54446-57b2-4239-b768-7ab85dc94c54.json} (82%)
 rename data/helm_lite/google/gemini-1.5-pro-002/{04415dda-306f-420c-8af8-54336368fc40.json => 5de8a13e-a029-4a90-9a2d-c28a59212140.json} (82%)
 rename data/helm_lite/google/gemini-2.0-flash-exp/{ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json => f9643ce2-7347-401b-903e-fadcc5221f36.json} (82%)
 rename data/helm_lite/google/gemma-2-27b-it/{5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json => 9932e430-2039-40b0-bc8f-ae2d833543e8.json} (82%)
 rename data/helm_lite/google/gemma-2-9b-it/{63af45df-c46d-46df-8f3e-592181ce6a7a.json => dbd2e9bb-c2ca-4165-b229-d736a70721a5.json} (82%)
 rename data/helm_lite/google/gemma-7b/{aad88f1f-6047-45e7-8b0f-d5deac20be68.json => 32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json} (82%)
 rename data/helm_lite/google/text-bison@001/{f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json => 70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json} (82%)
 rename data/helm_lite/google/text-unicorn@001/{35f70e20-8a08-4f7c-b822-5238337d4177.json => 07a367ee-2879-4ede-bbf8-33b24d682467.json} (82%)
 rename data/helm_lite/meta/llama-2-13b/{e19c56fc-5f6c-48a0-874a-97665283e6f0.json => fee914c7-d6bf-4d61-9f50-71bae5f11006.json} (82%)
 rename data/helm_lite/meta/llama-2-70b/{98a0c9bb-9679-4cc5-85b8-8801dbb965de.json => b0577066-231e-461b-bae8-b724b204397a.json} (82%)
 rename data/helm_lite/meta/llama-2-7b/{fad21bfe-048f-412c-b3fd-9b43d276b2a2.json => b79fe2e3-5eec-46f8-90a1-810781c8c46a.json} (82%)
 rename data/helm_lite/meta/llama-3-70b/{b1e28406-d88d-4acd-a268-7baebc9b565a.json => 998616ef-5d1b-4c65-b6ad-23afc3630d5a.json} (82%)
 rename data/helm_lite/meta/llama-3-8b/{60696eaf-669d-49bf-bebe-6cd171522faa.json => fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json} (82%)
 rename data/helm_lite/meta/llama-3.1-405b-instruct-turbo/{ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json => 25fde5e6-86b8-4a80-8f79-5946ef9999fc.json} (82%)
 rename data/helm_lite/meta/llama-3.1-70b-instruct-turbo/{c3b72d96-9af5-4e32-b420-e85a88e82e5a.json => b955825d-ae7f-48c4-9dad-5ee78879737d.json} (82%)
 rename data/helm_lite/meta/llama-3.1-8b-instruct-turbo/{57b2177d-0232-41ca-aa3a-b2ecb7af7586.json => 168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json} (82%)
 rename data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/{6ed32ce2-18e5-4d1b-94f8-443f81892275.json => 0807e353-9787-4ca0-8f7b-50d1bed2469e.json} (82%)
 rename data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/{5c11f938-7933-45ae-8530-05dac1012f10.json => 0164b885-2c27-4eba-8e6f-e69156cb0dee.json} (82%)
 rename data/helm_lite/meta/llama-3.3-70b-instruct-turbo/{2b9e00e5-15e1-45ea-a345-32a3d84460fb.json => 08422837-51a0-45c9-9004-fc5d98dce462.json} (82%)
 rename data/helm_lite/meta/llama-65b/{3e27a5c3-a752-4790-b219-5964331e40ac.json => 39f2c7f2-56d4-4349-95ae-374d34263f48.json} (82%)
 rename data/helm_lite/microsoft/phi-2/{061081c1-6044-40ec-b4a7-1668b8f3ba4f.json => 0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json} (82%)
 rename data/helm_lite/microsoft/phi-3-medium-4k-instruct/{33df0ce7-048b-4a1b-816c-a6221afe41de.json => 75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json} (82%)
 rename data/helm_lite/microsoft/phi-3-small-8k-instruct/{a3f47cc2-0563-4285-b777-0fcc3c642249.json => 2de4b89a-3f3b-4d1d-ba85-030953a46956.json} (82%)
 rename data/helm_lite/mistralai/mistral-7b-instruct-v0.3/{067ef4d7-387c-4c09-a1c4-a10af69811f0.json => bd68405f-fe9a-448b-9c80-468c656594e5.json} (82%)
 rename data/helm_lite/mistralai/mistral-7b-v0.1/{0a07f39c-745a-46c3-ad11-c79a50cc18bb.json => 4267fef1-3180-46e3-990e-0d1092ec4c18.json} (82%)
 rename data/helm_lite/mistralai/mistral-large-2402/{35797854-d46a-4646-94a2-3acf1d484418.json => 002a34dc-39e5-451d-b2a8-b51bdb69a056.json} (82%)
 rename data/helm_lite/mistralai/mistral-large-2407/{3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json => 5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json} (82%)
 rename data/helm_lite/mistralai/mistral-medium-2312/{33bd2b4e-0292-47b7-84de-de6ff5804257.json => ad2beded-cec3-4b47-b8de-a32a3225fa66.json} (82%)
 rename data/helm_lite/mistralai/mistral-small-2402/{67edb54d-efed-4a23-97ef-6d2a9f254ae1.json => eb901347-fc1f-4d8f-a70a-05a83e16658d.json} (82%)
 rename data/helm_lite/mistralai/mixtral-8x22b/{ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json => 9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json} (82%)
 rename data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/{469d069f-581e-415c-9c9d-f57e7c972da5.json => 042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json} (82%)
 rename data/helm_lite/mistralai/open-mistral-nemo-2407/{c9a3f927-041f-47cf-ae02-03fe4be0a59e.json => d2d48e4a-0484-4f44-8108-2e689d7ca695.json} (82%)
 rename data/helm_lite/openai/gpt-3.5-turbo-0613/{1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json => e54ae605-a91d-47d7-a08d-67bd0ea5c606.json} (82%)
 rename data/helm_lite/openai/gpt-4-0613/{4e58fdd9-e14c-441a-a9fb-4c525a615880.json => 15dccf75-871d-457b-8495-e0d03d550360.json} (82%)
 rename data/helm_lite/openai/gpt-4-1106-preview/{252ec309-9b98-463e-aee4-6e28deb6dcfb.json => 18fe5d30-bf36-405a-819e-1ecabda327ea.json} (82%)
 rename data/helm_lite/openai/gpt-4-turbo-2024-04-09/{5530c426-2321-4aa3-b860-f9b764b7b748.json => cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json} (82%)
 rename data/helm_lite/openai/gpt-4o-2024-05-13/{da92cfe0-b066-416a-9408-3eb9d36b9fb3.json => cd199905-04a4-4745-b848-4f7bde97ca17.json} (82%)
 rename data/helm_lite/openai/gpt-4o-2024-08-06/{2a752701-a826-4316-b3eb-e9eec90a5a89.json => 1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json} (82%)
 rename data/helm_lite/openai/gpt-4o-mini-2024-07-18/{bea4af4b-8155-4784-9192-b40270d574af.json => bfd70aff-bf45-4f55-b730-4924afc181cd.json} (82%)
 rename data/helm_lite/openai/text-davinci-002/{d08eccd1-602c-4d64-a487-2d9c028459a0.json => b6e08679-1bd7-42a1-9eee-98252de2c7c1.json} (82%)
 rename data/helm_lite/openai/text-davinci-003/{3cceb22d-7ce9-49a1-a677-548a97c52970.json => 22b411d5-a314-4b17-a9c7-c1af7ca7df33.json} (82%)
 rename data/helm_lite/qwen/qwen1.5-110b-chat/{6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json => f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json} (82%)
 rename data/helm_lite/qwen/qwen1.5-14b/{9b1ee735-bc25-48fd-94cd-24f17edcdc21.json => fb1bb023-16f6-4914-889b-6458d7ab1277.json} (82%)
 rename data/helm_lite/qwen/qwen1.5-32b/{a648cb90-bcce-4171-a664-df0b19304833.json => 8b572c10-3553-4e51-a321-bdb05996914b.json} (82%)
 rename data/helm_lite/qwen/qwen1.5-72b/{5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json => 6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json} (82%)
 rename data/helm_lite/qwen/qwen1.5-7b/{71d69629-11b9-4f06-98ca-536f1ab22f2c.json => e0efe169-d28e-418e-a78c-9b04ec29aae2.json} (82%)
 rename data/helm_lite/qwen/qwen2-72b-instruct/{a594b434-eeb2-41f5-b23d-eea23ed2add2.json => 05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json} (82%)
 rename data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/{e6a833e5-6b86-4d32-be03-010fdfde3ffc.json => 983696ae-d7f3-48a4-b7a0-a42487728182.json} (82%)
 rename data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/{cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json => a969e516-adef-4839-9252-244c58ab3c67.json} (82%)
 rename data/helm_lite/snowflake/snowflake-arctic-instruct/{2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json => f122f9de-b1ce-40ea-8731-6c00c7af0498.json} (82%)
 rename data/helm_lite/tiiuae/falcon-40b/{346c2a85-3daf-41e9-9305-78851dcf05ae.json => 5c7982c5-3513-4ff2-9857-33a0db825376.json} (82%)
 rename data/helm_lite/tiiuae/falcon-7b/{69e02d7b-d536-4ff4-a58e-b880ff87f357.json => 4910859a-750c-4728-bf30-309e0e81690e.json} (82%)
 rename data/helm_lite/upstage/solar-pro-241126/{3286a69f-cdba-49a5-939a-e14ad759e7a4.json => 32f0532f-b504-492d-84d7-f541930edad0.json} (82%)
 rename data/helm_lite/writer/palmyra-x-004/{b798adc1-01f0-46c5-95a4-8b67199d624b.json => 04c187a3-4532-4523-b39d-19314d61c779.json} (82%)
 rename data/helm_lite/writer/palmyra-x-v2/{7a07a202-aa88-47fc-987d-6d44a57b6985.json => 4440532c-9b49-4c9a-8bf4-f122531c54fa.json} (82%)
 rename data/helm_lite/writer/palmyra-x-v3/{ac0a7249-11e7-493d-9190-8c1913bb1c42.json => bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json} (82%)
 rename data/helm_mmlu/01-ai/yi-34b/{73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json => 3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json} (82%)
 rename data/helm_mmlu/01-ai/yi-6b/{97569bf5-1e12-4baa-80cc-019be1725ebb.json => 6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json} (82%)
 rename data/helm_mmlu/01-ai/yi-large-preview/{7c4b387f-45be-41cb-8102-cd738e60f99d.json => 3d0b3d68-a853-4989-a35e-83ac6722c2da.json} (82%)
 rename data/helm_mmlu/ai21/jamba-1.5-large/{027b7bd4-8943-4d2c-9674-15d33792d391.json => ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json} (82%)
 rename data/helm_mmlu/ai21/jamba-1.5-mini/{e5ed6c70-6874-4671-abb0-25bbd82471b4.json => 517e8027-6edd-482b-86f3-33b6c41a9609.json} (82%)
 rename data/helm_mmlu/ai21/jamba-instruct/{4e236f80-5d03-4547-b199-b8718439fbed.json => f7c1c125-ad0f-4847-b880-4f705f1666c6.json} (82%)
 rename data/helm_mmlu/allenai/olmo-1.7-7b/{1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json => 5a0ba280-8a12-4735-9d92-4ed71ba395b4.json} (82%)
 rename data/helm_mmlu/allenai/olmo-7b/{31666792-6d68-42da-95f8-3b9f8590c7fd.json => 73ccc6a6-e10d-4619-914f-26032cddf8da.json} (82%)
 rename data/helm_mmlu/amazon/{nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json => nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json} (82%)
 rename data/helm_mmlu/amazon/{nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json => nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json} (82%)
 rename data/helm_mmlu/amazon/{nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json => nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json} (82%)
 rename data/helm_mmlu/anthropic/claude-2.1/{357edc36-d500-4e6e-94a4-6653b769b5d8.json => aa8cae95-cb75-4241-951c-25e2046042dd.json} (82%)
 rename data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/{67f72a7f-15b7-4a2e-b478-38091cba2189.json => c88e4a03-22ae-4338-bf5f-36070814136a.json} (82%)
 rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/{3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json => 4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json} (82%)
 rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/{f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json => ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json} (82%)
 rename data/helm_mmlu/anthropic/claude-3-haiku-20240307/{b0218eab-984f-4829-90d6-e7fc6f60c530.json => 097a8da1-f411-4359-8440-2ab06f4ae76c.json} (82%)
 rename data/helm_mmlu/anthropic/claude-3-opus-20240229/{fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json => 68130abd-1df5-4cd3-919a-2863e9f013c7.json} (82%)
 rename data/helm_mmlu/anthropic/claude-3-sonnet-20240229/{08d951d1-2912-4a00-99ce-f90340a7fd2a.json => 5d8d795a-d213-4b96-9b17-ad5fae6b3687.json} (82%)
 rename data/helm_mmlu/anthropic/claude-instant-1.2/{bfff8f1b-24cc-41b8-b11c-85ee48bef059.json => 7908da03-f030-4c62-a121-c04bd94ea75e.json} (82%)
 rename data/helm_mmlu/cohere/command-r-plus/{f1509273-dea1-477e-bf04-02767838c1f9.json => c6fdbf96-2500-4410-8fcd-268ea3e16062.json} (82%)
 rename data/helm_mmlu/cohere/command-r/{45524eef-0678-47db-8620-a5f55e166e63.json => 537164c3-7b88-4543-b19d-370f55a25a66.json} (82%)
 rename data/helm_mmlu/databricks/dbrx-instruct/{cd2371e9-e552-4944-bc30-c2269c960e16.json => 0c539e26-8403-42db-acfc-7953dd80ae20.json} (82%)
 rename data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/{7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json => 364c7490-8bb1-4e7e-b485-fb3c2224da58.json} (82%)
 rename data/helm_mmlu/deepseek-ai/deepseek-v3/{87716ef9-56bb-4737-b578-9e53742c714a.json => 1a9167d2-882c-4582-b4e0-ac425896a317.json} (82%)
 rename data/helm_mmlu/google/gemini-1.0-pro-001/{8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json => 8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json} (82%)
 rename data/helm_mmlu/google/gemini-1.5-flash-001/{ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json => d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json} (82%)
 rename data/helm_mmlu/google/gemini-1.5-flash-002/{ec78481a-0b0d-4709-99ea-6423372d6038.json => a94c9e13-dca7-4e02-a795-09d9274354d3.json} (82%)
 rename data/helm_mmlu/google/gemini-1.5-flash-preview-0514/{2a8845b3-cdbc-409c-8346-f83fb607999a.json => 75c8b20f-a4d4-4699-be79-f027bf7f0d69.json} (82%)
 rename data/helm_mmlu/google/gemini-1.5-pro-001/{486b6479-f327-43ab-af2c-8824abaf5fe6.json => 264be7b4-08b7-40b6-a5e7-f3536f361450.json} (82%)
 rename data/helm_mmlu/google/gemini-1.5-pro-002/{4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json => 83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json} (82%)
 rename data/helm_mmlu/google/gemini-1.5-pro-preview-0409/{bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json => 8a013eb3-0f21-4a50-8a53-4ba977951130.json} (82%)
 rename data/helm_mmlu/google/gemini-2.0-flash-exp/{0837a2fd-1f25-4133-9ce6-b8ca29830f70.json => 7b081a40-7cb6-4405-b842-3db95f290dfa.json} (82%)
 rename data/helm_mmlu/google/gemma-2-27b/{b732e4c3-526e-42b3-8003-defe6f99dec5.json => 54185b53-9891-43c6-8f93-09ff02b728d8.json} (82%)
 rename data/helm_mmlu/google/gemma-2-9b/{72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json => 884c194d-6519-4bd4-8add-6514e593c514.json} (82%)
 rename data/helm_mmlu/google/gemma-7b/{11b66d50-28d9-42bc-8f91-463b02fa96f7.json => a80cbd76-bcf8-4174-b0b3-346fae152bdb.json} (82%)
 rename data/helm_mmlu/google/text-bison@001/{70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json => 5f105986-aa7d-4858-91bc-cece9d0085ba.json} (82%)
 rename data/helm_mmlu/google/text-unicorn@001/{c2e53d3a-b85c-4888-8b20-225db39301ab.json => 528b7b4e-c8a6-4387-bd98-497a3316029d.json} (82%)
 rename data/helm_mmlu/meta/llama-2-13b/{a477c332-b082-4ad5-8d2f-905690e9d211.json => 96eb34db-66bd-4945-8b4c-a8c1394fe56a.json} (82%)
 rename data/helm_mmlu/meta/llama-2-70b/{ba574f5e-cc59-4994-a595-e6472c032fc4.json => 961e917b-0e67-462c-b9d0-0fe4b4b85beb.json} (82%)
 rename data/helm_mmlu/meta/llama-2-7b/{9cfa7f91-bfd0-4f02-988c-1978df8db303.json => 59a85d2c-16ce-4ed4-bc65-f6898127fa57.json} (82%)
 rename data/helm_mmlu/meta/llama-3-70b/{607a4b9b-3442-4690-b116-a927c6822fb3.json => 16a8b446-51fc-4c23-9231-46ee16c1c0a8.json} (82%)
 rename data/helm_mmlu/meta/llama-3-8b/{44decfe6-57ed-4677-a859-4fe5ae25b237.json => f4de7e58-7060-440b-8f6f-1f79d7499d1e.json} (82%)
 rename data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/{af78c3b5-5d91-431d-85ac-783b5a324723.json => 5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json} (82%)
 rename data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/{1224cee0-22f8-41b0-a7da-8a6100001a3e.json => dc6aa933-67e4-4811-b3e2-e5200c002abe.json} (82%)
 rename data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/{2cb2551b-dbca-46d9-a19a-165d1ac60dee.json => 5f9758a3-fd6d-4598-930a-9c01420d05e8.json} (82%)
 rename data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/{3c53ce3d-4ee8-483c-be9f-964395103289.json => 7592c0d8-a06c-4189-81a1-dbf794d22c8b.json} (82%)
 rename data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/{11e364be-39e9-4b42-97d7-ab771f17973c.json => 83c0e8e3-087c-4d61-9153-e571b4971871.json} (82%)
 rename data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/{bbcf8f14-600c-4c93-b63d-64aabcab23a3.json => c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json} (82%)
 rename data/helm_mmlu/microsoft/phi-2/{91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json => 5baac093-babb-41cd-a2f4-985d0b91be37.json} (82%)
 rename data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/{e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json => 1bf54088-ba12-45b4-8f80-63d5c38f58f6.json} (82%)
 rename data/helm_mmlu/microsoft/phi-3-small-8k-instruct/{16c66bdf-dda3-4b12-b38c-73abee6a702f.json => 5ed0a970-200f-4f23-9623-e714afa49ddf.json} (82%)
 rename data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/{d0783259-681a-438f-b7dc-1c625a0be8ba.json => e7fd06a6-65e5-4f88-8e86-c513f78e31db.json} (82%)
 rename data/helm_mmlu/mistralai/mistral-7b-v0.1/{a05ce725-cdf0-4fe3-88b9-8631229e4443.json => ac047aef-008f-4c87-a6d5-4f331ebf5c53.json} (82%)
 rename data/helm_mmlu/mistralai/mistral-large-2402/{0dee4200-c4f0-438e-8d0d-ca92515c6e33.json => ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json} (82%)
 rename data/helm_mmlu/mistralai/mistral-large-2407/{2869d585-567d-4ddc-ac38-3e036061b13e.json => 7517b6c9-c613-416c-aadb-39fd6d252da7.json} (82%)
 rename data/helm_mmlu/mistralai/mistral-small-2402/{d277cca3-64da-4e4b-9210-3f5b910c975c.json => 85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json} (82%)
 rename data/helm_mmlu/mistralai/mixtral-8x22b/{cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json => df568c3c-8a5c-4455-836d-c980d7f5ea5c.json} (82%)
 rename data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/{0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json => 96e24977-ca6d-402c-bfd8-62be4cd9b902.json} (82%)
 rename data/helm_mmlu/mistralai/open-mistral-nemo-2407/{87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json => e5b2636a-8438-40c0-9f89-9f35585bf740.json} (82%)
 rename data/helm_mmlu/openai/gpt-3.5-turbo-0125/{48a0dd6b-9304-460a-8e4e-420c60dfa854.json => f3259d92-3c95-4b78-81ae-f7f4b80aec63.json} (82%)
 rename data/helm_mmlu/openai/gpt-3.5-turbo-0613/{1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json => 5ba23a34-4232-487f-b3e9-326d776135be.json} (82%)
 rename data/helm_mmlu/openai/gpt-4-0613/{8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json => 5bc1a462-f753-4259-91c3-a549491b2986.json} (82%)
 rename data/helm_mmlu/openai/gpt-4-1106-preview/{174ad35c-d6b5-49bd-930c-9c83608213a9.json => 16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json} (82%)
 rename data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/{348bbc24-09de-4d1e-98bc-079e87fea558.json => dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json} (82%)
 rename data/helm_mmlu/openai/gpt-4o-2024-05-13/{f37fc452-58f2-4d80-a71c-9331f7fe549e.json => 2ca11d4c-52e6-49ea-a5cb-238c0313c483.json} (82%)
 rename data/helm_mmlu/openai/gpt-4o-2024-08-06/{71df45d2-1a27-4ff2-853c-e853f809ff52.json => de400624-6c2e-47af-b851-54c4075c30ee.json} (82%)
 rename data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/{7c049135-a8bc-46ca-9a85-cba23e8696fd.json => 34441b3b-4d66-444c-af85-ca0666a48ed4.json} (82%)
 rename data/helm_mmlu/qwen/qwen1.5-110b-chat/{69737d19-682b-494f-b10b-fb788e83076b.json => eecf5e40-9110-47ea-a72b-9ba587b96e30.json} (82%)
 rename data/helm_mmlu/qwen/qwen1.5-14b/{c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json => f26fb123-c214-4d18-aea8-b05b4ea1819b.json} (82%)
 rename data/helm_mmlu/qwen/qwen1.5-32b/{ed668c03-e5df-4871-b2fa-876b2cda62f3.json => 30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json} (82%)
 rename data/helm_mmlu/qwen/qwen1.5-72b/{c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json => b152cd5c-cbc0-48f4-ba37-16878c3afba1.json} (82%)
 rename data/helm_mmlu/qwen/qwen1.5-7b/{1c743b00-0ca6-4332-9bb6-7f62190d74e3.json => dac223e9-3073-46f9-924b-c5a6408f5da9.json} (82%)
 rename data/helm_mmlu/qwen/qwen2-72b-instruct/{7f9317d3-b2bc-481d-9b28-9f305612ac58.json => a7a218ff-7afe-417c-ac39-cf305d592d56.json} (82%)
 rename data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/{7b3bc40a-a606-419d-b784-99697c1df5bc.json => 2e165735-43b8-4317-9cde-35aa4b5bcb26.json} (82%)
 rename data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/{d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json => 15c25bc5-7b1e-4771-bda2-fd04d74e1463.json} (82%)
 rename data/helm_mmlu/snowflake/snowflake-arctic-instruct/{cc68185c-6ee2-40bd-8951-f104d898c7f8.json => 26036c7c-e981-46e8-b5e9-dcd7d116af70.json} (82%)
 rename data/helm_mmlu/upstage/solar-pro-241126/{78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json => b3269e4e-98a7-4795-8ef3-fc87774a54b7.json} (82%)
 rename data/helm_mmlu/writer/palmyra-x-004/{ba74f375-fd6d-4bba-af63-605bd73c9b7f.json => 284fde9f-8570-4e6d-9190-e52d8723fe57.json} (82%)
 rename data/helm_mmlu/writer/palmyra-x-v3/{41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json => fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json} (82%)
 create mode 100755 scripts/HELM/parse_helm_leaderboards.sh

diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json
similarity index 80%
rename from data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json
rename to data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json
index 42f19b810..28c2132cc 100644
--- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json
+++ b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.475,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,14 +100,23 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -141,14 +162,23 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -194,11 +224,20 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -244,12 +283,21 @@
         }
       },
       "generation_config": {
-        "subset": "v2",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "v2",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -295,7 +343,9 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     }
   ]
diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json
similarity index 80%
rename from data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json
rename to data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json
index c596a8093..c2c0ac804 100644
--- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json
+++ b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.44,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,14 +100,23 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -141,14 +162,23 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -194,11 +224,20 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -244,12 +283,21 @@
         }
       },
       "generation_config": {
-        "subset": "v2",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "v2",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -295,7 +343,9 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     }
   ]
diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json
similarity index 80%
rename from data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json
rename to data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json
index da8bb1b91..cbc2ce18e 100644
--- a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json
+++ b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.405,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,14 +100,23 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -141,14 +162,23 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -194,11 +224,20 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -244,12 +283,21 @@
         }
       },
       "generation_config": {
-        "subset": "v2",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "v2",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -295,7 +343,9 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     }
   ]
diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json
similarity index 80%
rename from data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json
rename to data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json
index cb4638d3d..4bae095b1 100644
--- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json
+++ b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.332,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,14 +100,23 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -141,14 +162,23 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -194,11 +224,20 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -244,12 +283,21 @@
         }
       },
       "generation_config": {
-        "subset": "v2",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "v2",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -295,7 +343,9 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     }
   ]
diff --git a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json b/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json
similarity index 80%
rename from data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json
rename to data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json
index 0670c6db3..f34e2fca2 100644
--- a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json
+++ b/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.551,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json b/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json
similarity index 80%
rename from data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json
rename to data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json
index 2c6f0abd0..da4fca4b9 100644
--- a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json
+++ b/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.522,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json
similarity index 81%
rename from data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json
rename to data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json
index 3a64b94b2..7d306af4a 100644
--- a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json
+++ b/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.637,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json b/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json
similarity index 80%
rename from data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json
rename to data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json
index bbdb8512b..9634c0423 100644
--- a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json
+++ b/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.591,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json
rename to data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json
index 44b7ab97a..59583f434 100644
--- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json
+++ b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Haiku (20241022)",
+    "name": "Claude 3.5 Haiku 20241022",
     "id": "anthropic/claude-3-5-haiku-20241022",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.549,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json
rename to data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json
index b8e94bdb5..050628b1e 100644
--- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json
+++ b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Sonnet (20241022)",
+    "name": "Claude 3.5 Sonnet 20241022",
     "id": "anthropic/claude-3-5-sonnet-20241022",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.653,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json
rename to data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json
index a41bf85dc..325dd380e 100644
--- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json
+++ b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.7 Sonnet (20250219)",
+    "name": "Claude 3.7 Sonnet 20250219",
     "id": "anthropic/claude-3-7-sonnet-20250219",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.674,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json
new file mode 100644
index 000000000..82dc8fad1
--- /dev/null
+++ b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "Claude 4.5 Haiku 20251001",
+    "id": "anthropic/claude-haiku-4-5-20251001",
+    "developer": "anthropic",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.717,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 7.381503096938465
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.777,
+        "details": {
+          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=3.701, mean=3.701, max=3.701, sum=3.701 (1)",
+            "tab": "Efficiency",
+            "score": 3.7008020806312563
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)",
+            "tab": "General information",
+            "score": 252.461
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=374.129, mean=374.129, max=374.129, sum=374.129 (1)",
+            "tab": "General information",
+            "score": 374.129
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.605,
+        "details": {
+          "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=5.102, mean=5.102, max=5.102, sum=5.102 (1)",
+            "tab": "Efficiency",
+            "score": 5.102193982611857
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)",
+            "tab": "General information",
+            "score": 272.73766816143495
+          },
+          "GPQA - # output tokens": {
+            "description": "min=524.525, mean=524.525, max=524.525, sum=524.525 (1)",
+            "tab": "General information",
+            "score": 524.5246636771301
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.801,
+        "details": {
+          "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=4.355, mean=4.355, max=4.355, sum=4.355 (1)",
+            "tab": "Efficiency",
+            "score": 4.355410516372229
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)",
+            "tab": "General information",
+            "score": 47.15896487985213
+          },
+          "IFEval - # output tokens": {
+            "description": "min=390.416, mean=390.416, max=390.416, sum=390.416 (1)",
+            "tab": "General information",
+            "score": 390.4158964879852
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.839,
+        "details": {
+          "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=16.317, mean=16.317, max=16.317, sum=16.317 (1)",
+            "tab": "Efficiency",
+            "score": 16.317131044387818
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=1835.337, mean=1835.337, max=1835.337, sum=1835.337 (1)",
+            "tab": "General information",
+            "score": 1835.337
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.561,
+        "details": {
+          "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=7.432, mean=7.432, max=7.432, sum=7.432 (1)",
+            "tab": "Efficiency",
+            "score": 7.431977860689163
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)",
+            "tab": "General information",
+            "score": 110.563
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=937.799, mean=937.799, max=937.799, sum=937.799 (1)",
+            "tab": "General information",
+            "score": 937.799
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json
rename to data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json
index a9349e9cb..0e6c52fbd 100644
--- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json
+++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 4 Opus (20250514, extended thinking)",
+    "name": "Claude 4 Opus 20250514, extended thinking",
     "id": "anthropic/claude-opus-4-20250514-thinking-10k",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.78,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json
rename to data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json
index c82ca8963..7abaf15ac 100644
--- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json
+++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 4 Opus (20250514)",
+    "name": "Claude 4 Opus 20250514",
     "id": "anthropic/claude-opus-4-20250514",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.757,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json
rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json
index 6bf01f358..f65747fef 100644
--- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json
+++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 4 Sonnet (20250514, extended thinking)",
+    "name": "Claude 4 Sonnet 20250514, extended thinking",
     "id": "anthropic/claude-sonnet-4-20250514-thinking-10k",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.766,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json
similarity index 80%
rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json
rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json
index af4facce4..98193fa4e 100644
--- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json
+++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 4 Sonnet (20250514)",
+    "name": "Claude 4 Sonnet 20250514",
     "id": "anthropic/claude-sonnet-4-20250514",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.733,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json
new file mode 100644
index 000000000..3583acbb0
--- /dev/null
+++ b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "Claude 4.5 Sonnet 20250929",
+    "id": "anthropic/claude-sonnet-4-5-20250929",
+    "developer": "anthropic",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.762,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 17.536448448412127
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.869,
+        "details": {
+          "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=9.03, mean=9.03, max=9.03, sum=9.03 (1)",
+            "tab": "Efficiency",
+            "score": 9.029817205530268
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)",
+            "tab": "General information",
+            "score": 252.461
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=392.292, mean=392.292, max=392.292, sum=392.292 (1)",
+            "tab": "General information",
+            "score": 392.292
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.686,
+        "details": {
+          "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=12.414, mean=12.414, max=12.414, sum=12.414 (1)",
+            "tab": "Efficiency",
+            "score": 12.414452127318263
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)",
+            "tab": "General information",
+            "score": 272.73766816143495
+          },
+          "GPQA - # output tokens": {
+            "description": "min=544.215, mean=544.215, max=544.215, sum=544.215 (1)",
+            "tab": "General information",
+            "score": 544.2152466367713
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.85,
+        "details": {
+          "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=10.904, mean=10.904, max=10.904, sum=10.904 (1)",
+            "tab": "Efficiency",
+            "score": 10.90394415211986
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)",
+            "tab": "General information",
+            "score": 47.15896487985213
+          },
+          "IFEval - # output tokens": {
+            "description": "min=414.632, mean=414.632, max=414.632, sum=414.632 (1)",
+            "tab": "General information",
+            "score": 414.63216266173754
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.854,
+        "details": {
+          "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=38.544, mean=38.544, max=38.544, sum=38.544 (1)",
+            "tab": "Efficiency",
+            "score": 38.54364204096484
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=1804.604, mean=1804.604, max=1804.604, sum=1804.604 (1)",
+            "tab": "General information",
+            "score": 1804.604
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.553,
+        "details": {
+          "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=16.79, mean=16.79, max=16.79, sum=16.79 (1)",
+            "tab": "Efficiency",
+            "score": 16.790386716127397
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)",
+            "tab": "General information",
+            "score": 110.563
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=892.774, mean=892.774, max=892.774, sum=892.774 (1)",
+            "tab": "General information",
+            "score": 892.774
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json
similarity index 81%
rename from data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json
rename to data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json
index 0b36b4b41..6cc5a7f14 100644
--- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json
+++ b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.699,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json
similarity index 81%
rename from data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json
rename to data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json
index 3502a2f83..46c4843d4 100644
--- a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json
+++ b/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.665,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json b/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json
similarity index 80%
rename from data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json
rename to data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json
index 9cecc3e6e..26e2e73d6 100644
--- a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json
+++ b/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Flash (002)",
+    "name": "Gemini 1.5 Flash 002",
     "id": "google/gemini-1.5-flash-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.609,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json b/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json
similarity index 80%
rename from data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json
rename to data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json
index c41c3cf10..1157dc164 100644
--- a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json
+++ b/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Pro (002)",
+    "name": "Gemini 1.5 Pro 002",
     "id": "google/gemini-1.5-pro-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.657,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json b/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json
similarity index 81%
rename from data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json
rename to data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json
index 963d02bef..68450c9bd 100644
--- a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json
+++ b/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.679,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json
similarity index 80%
rename from data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json
rename to data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json
index 87e886284..1bc6a5842 100644
--- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json
+++ b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 2.0 Flash Lite (02-05 preview)",
+    "name": "Gemini 2.0 Flash Lite 02-05 preview",
     "id": "google/gemini-2.0-flash-lite-preview-02-05",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.642,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json
similarity index 81%
rename from data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json
rename to data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json
index a5294b486..f9f820a96 100644
--- a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json
+++ b/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.591,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json
similarity index 80%
rename from data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json
rename to data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json
index d0e1ed757..7f7987a29 100644
--- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json
+++ b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 2.5 Flash (04-17 preview)",
+    "name": "Gemini 2.5 Flash 04-17 preview",
     "id": "google/gemini-2.5-flash-preview-04-17",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.626,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json
similarity index 80%
rename from data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json
rename to data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json
index f1093c814..c845227fa 100644
--- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json
+++ b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 2.5 Pro (03-25 preview)",
+    "name": "Gemini 2.5 Pro 03-25 preview",
     "id": "google/gemini-2.5-pro-preview-03-25",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.745,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json b/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json
new file mode 100644
index 000000000..e4e82cd5b
--- /dev/null
+++ b/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "Gemini 3 Pro Preview",
+    "id": "google/gemini-3-pro-preview",
+    "developer": "google",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.799,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 50.969324812798575
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.903,
+        "details": {
+          "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=34.903, mean=34.903, max=34.903, sum=34.903 (1)",
+            "tab": "Efficiency",
+            "score": 34.903078527212145
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)",
+            "tab": "General information",
+            "score": 263.673
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.803,
+        "details": {
+          "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=69.164, mean=69.164, max=69.164, sum=69.164 (1)",
+            "tab": "Efficiency",
+            "score": 69.16407415364355
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)",
+            "tab": "General information",
+            "score": 273.7354260089686
+          },
+          "GPQA - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.876,
+        "details": {
+          "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=18.201, mean=18.201, max=18.201, sum=18.201 (1)",
+            "tab": "Efficiency",
+            "score": 18.200553727458452
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)",
+            "tab": "General information",
+            "score": 47.33086876155268
+          },
+          "IFEval - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.859,
+        "details": {
+          "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=37.094, mean=37.094, max=37.094, sum=37.094 (1)",
+            "tab": "Efficiency",
+            "score": 37.09404513451669
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.555,
+        "details": {
+          "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=95.485, mean=95.485, max=95.485, sum=95.485 (1)",
+            "tab": "Efficiency",
+            "score": 95.48487252116203
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)",
+            "tab": "General information",
+            "score": 111.956
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json
similarity index 81%
rename from data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json
rename to data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json
index 42be38419..828363b5a 100644
--- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json
+++ b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.463,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json b/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json
new file mode 100644
index 000000000..8203eb4c6
--- /dev/null
+++ b/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "IBM Granite 4.0 Small",
+    "id": "ibm/granite-4.0-h-small",
+    "developer": "ibm",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.575,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 21.31162992088884
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.569,
+        "details": {
+          "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=12.071, mean=12.071, max=12.071, sum=12.071 (1)",
+            "tab": "Efficiency",
+            "score": 12.070928404092788
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)",
+            "tab": "General information",
+            "score": 288.391
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=372.93, mean=372.93, max=372.93, sum=372.93 (1)",
+            "tab": "General information",
+            "score": 372.93
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.383,
+        "details": {
+          "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=17.606, mean=17.606, max=17.606, sum=17.606 (1)",
+            "tab": "Efficiency",
+            "score": 17.606201725690354
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)",
+            "tab": "General information",
+            "score": 303.2645739910314
+          },
+          "GPQA - # output tokens": {
+            "description": "min=439.648, mean=439.648, max=439.648, sum=439.648 (1)",
+            "tab": "General information",
+            "score": 439.6479820627803
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.89,
+        "details": {
+          "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=13.366, mean=13.366, max=13.366, sum=13.366 (1)",
+            "tab": "Efficiency",
+            "score": 13.366226098453712
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)",
+            "tab": "General information",
+            "score": 51.53419593345656
+          },
+          "IFEval - # output tokens": {
+            "description": "min=494.717, mean=494.717, max=494.717, sum=494.717 (1)",
+            "tab": "General information",
+            "score": 494.7171903881701
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.739,
+        "details": {
+          "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=30.807, mean=30.807, max=30.807, sum=30.807 (1)",
+            "tab": "Efficiency",
+            "score": 30.80672695994377
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=996.159, mean=996.159, max=996.159, sum=996.159 (1)",
+            "tab": "General information",
+            "score": 996.159
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.296,
+        "details": {
+          "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=32.708, mean=32.708, max=32.708, sum=32.708 (1)",
+            "tab": "Efficiency",
+            "score": 32.70806641626358
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)",
+            "tab": "General information",
+            "score": 118.438
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=1020.51, mean=1020.51, max=1020.51, sum=1020.51 (1)",
+            "tab": "General information",
+            "score": 1020.51
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json b/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json
new file mode 100644
index 000000000..bfe399026
--- /dev/null
+++ b/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "IBM Granite 4.0 Micro",
+    "id": "ibm/granite-4.0-micro",
+    "developer": "ibm",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.486,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 5.725128505637726
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.395,
+        "details": {
+          "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=3.135, mean=3.135, max=3.135, sum=3.135 (1)",
+            "tab": "Efficiency",
+            "score": 3.1348352246284485
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)",
+            "tab": "General information",
+            "score": 288.391
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=325.255, mean=325.255, max=325.255, sum=325.255 (1)",
+            "tab": "General information",
+            "score": 325.255
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.307,
+        "details": {
+          "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=3.075, mean=3.075, max=3.075, sum=3.075 (1)",
+            "tab": "Efficiency",
+            "score": 3.075281912970436
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)",
+            "tab": "General information",
+            "score": 303.2645739910314
+          },
+          "GPQA - # output tokens": {
+            "description": "min=337.417, mean=337.417, max=337.417, sum=337.417 (1)",
+            "tab": "General information",
+            "score": 337.4170403587444
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.849,
+        "details": {
+          "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=4.58, mean=4.58, max=4.58, sum=4.58 (1)",
+            "tab": "Efficiency",
+            "score": 4.580414981806785
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)",
+            "tab": "General information",
+            "score": 51.53419593345656
+          },
+          "IFEval - # output tokens": {
+            "description": "min=497.8, mean=497.8, max=497.8, sum=497.8 (1)",
+            "tab": "General information",
+            "score": 497.8003696857671
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.67,
+        "details": {
+          "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=8.161, mean=8.161, max=8.161, sum=8.161 (1)",
+            "tab": "Efficiency",
+            "score": 8.160923891305924
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=1037.706, mean=1037.706, max=1037.706, sum=1037.706 (1)",
+            "tab": "General information",
+            "score": 1037.706
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.209,
+        "details": {
+          "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=9.674, mean=9.674, max=9.674, sum=9.674 (1)",
+            "tab": "Efficiency",
+            "score": 9.674186517477036
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)",
+            "tab": "General information",
+            "score": 118.438
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=1145.889, mean=1145.889, max=1145.889, sum=1145.889 (1)",
+            "tab": "General information",
+            "score": 1145.889
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json b/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json
similarity index 80%
rename from data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json
rename to data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json
index 3622da7c6..215be80f3 100644
--- a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json
+++ b/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.325,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,14 +100,23 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -141,14 +162,23 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -194,11 +224,20 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -244,12 +283,21 @@
         }
       },
       "generation_config": {
-        "subset": "v2",
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "subset": "v2",
+          "num_output_tokens": "2048"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -295,7 +343,9 @@
         }
       },
       "generation_config": {
-        "num_output_tokens": "2048"
+        "additional_details": {
+          "num_output_tokens": "2048"
+        }
       }
     }
   ]
diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json
similarity index 80%
rename from data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json
rename to data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json
index 6e7a59864..41fd4d1af 100644
--- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json
+++ b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (405B)",
+    "name": "Llama 3.1 Instruct Turbo 405B",
     "id": "meta/llama-3.1-405b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.618,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json
similarity index 80%
rename from data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json
rename to data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json
index 9ba719da5..7e6e617b7 100644
--- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json
+++ b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (70B)",
+    "name": "Llama 3.1 Instruct Turbo 70B",
     "id": "meta/llama-3.1-70b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.574,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json
similarity index 80%
rename from data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json
rename to data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json
index 4657892fd..0c2bb79e7 100644
--- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json
+++ b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (8B)",
+    "name": "Llama 3.1 Instruct Turbo 8B",
     "id": "meta/llama-3.1-8b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.444,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json
similarity index 80%
rename from data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json
rename to data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json
index 9c2141acc..71c8e88c3 100644
--- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json
+++ b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 4 Maverick (17Bx128E) Instruct FP8",
+    "name": "Llama 4 Maverick 17Bx128E Instruct FP8",
     "id": "meta/llama-4-maverick-17b-128e-instruct-fp8",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.718,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json
similarity index 80%
rename from data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json
rename to data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json
index 2d19156dc..35aef174b 100644
--- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json
+++ b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 4 Scout (17Bx16E) Instruct",
+    "name": "Llama 4 Scout 17Bx16E Instruct",
     "id": "meta/llama-4-scout-17b-16e-instruct",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.644,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json
similarity index 80%
rename from data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json
rename to data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json
index 6663598e4..ee064ad73 100644
--- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json
+++ b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Instruct v0.3 (7B)",
+    "name": "Mistral Instruct v0.3 7B",
     "id": "mistralai/mistral-7b-instruct-v0.3",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.376,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json b/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json
similarity index 81%
rename from data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json
rename to data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json
index db1fa9b82..f4fd3ec06 100644
--- a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json
+++ b/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Large (2411)",
+    "name": "Mistral Large 2411",
     "id": "mistralai/mistral-large-2411",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.598,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json b/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json
similarity index 80%
rename from data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json
rename to data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json
index 69ce74931..ff90f0105 100644
--- a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json
+++ b/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Small 3.1 (2503)",
+    "name": "Mistral Small 3.1 2503",
     "id": "mistralai/mistral-small-2503",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.558,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json
similarity index 80%
rename from data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json
rename to data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json
index 2dfb94872..703963331 100644
--- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json
+++ b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mixtral Instruct (8x22B)",
+    "name": "Mixtral Instruct 8x22B",
     "id": "mistralai/mixtral-8x22b-instruct-v0.1",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.478,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json
similarity index 80%
rename from data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json
rename to data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json
index 293d11168..c522fd879 100644
--- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json
+++ b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mixtral Instruct (8x7B)",
+    "name": "Mixtral Instruct 8x7B",
     "id": "mistralai/mixtral-8x7b-instruct-v0.1",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.397,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json
similarity index 81%
rename from data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json
rename to data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json
index 4c25e86d3..b69be21a9 100644
--- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json
+++ b/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.768,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json
similarity index 81%
rename from data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json
rename to data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json
index c005600e1..17443bc6f 100644
--- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json
+++ b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4.1 (2025-04-14)",
+    "name": "GPT-4.1 2025-04-14",
     "id": "openai/gpt-4.1-2025-04-14",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.727,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json
similarity index 80%
rename from data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json
rename to data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json
index d6481e60a..0342d7835 100644
--- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json
+++ b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4.1 mini (2025-04-14)",
+    "name": "GPT-4.1 mini 2025-04-14",
     "id": "openai/gpt-4.1-mini-2025-04-14",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.726,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json
similarity index 80%
rename from data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json
rename to data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json
index e878bf385..15a7d0356 100644
--- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json
+++ b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4.1 nano (2025-04-14)",
+    "name": "GPT-4.1 nano 2025-04-14",
     "id": "openai/gpt-4.1-nano-2025-04-14",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.616,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json
similarity index 81%
rename from data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json
rename to data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json
index ae08e8732..ed5380bd3 100644
--- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json
+++ b/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o (2024-11-20)",
+    "name": "GPT-4o 2024-11-20",
     "id": "openai/gpt-4o-2024-11-20",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.634,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json
similarity index 80%
rename from data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json
rename to data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json
index c3aeb8ab5..e38c0ac88 100644
--- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json
+++ b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o mini (2024-07-18)",
+    "name": "GPT-4o mini 2024-07-18",
     "id": "openai/gpt-4o-mini-2024-07-18",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.565,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json
similarity index 81%
rename from data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json
rename to data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json
index 2fd77c3d1..fb85b633b 100644
--- a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json
+++ b/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-5 (2025-08-07)",
+    "name": "GPT-5 2025-08-07",
     "id": "openai/gpt-5-2025-08-07",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.807,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json
similarity index 81%
rename from data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json
rename to data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json
index cf4a0414b..3ca436502 100644
--- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json
+++ b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-5 mini (2025-08-07)",
+    "name": "GPT-5 mini 2025-08-07",
     "id": "openai/gpt-5-mini-2025-08-07",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.819,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json
similarity index 80%
rename from data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json
rename to data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json
index a9996e0cd..e271e8724 100644
--- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json
+++ b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-5 nano (2025-08-07)",
+    "name": "GPT-5 nano 2025-08-07",
     "id": "openai/gpt-5-nano-2025-08-07",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.748,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json
new file mode 100644
index 000000000..492db1047
--- /dev/null
+++ b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "GPT-5.1 2025-11-13",
+    "id": "openai/gpt-5.1-2025-11-13",
+    "developer": "openai",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.656,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 10.620566227529599
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.579,
+        "details": {
+          "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)",
+            "tab": "Efficiency",
+            "score": 1.1470122172832489
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)",
+            "tab": "General information",
+            "score": 248.569
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=5.002, mean=5.002, max=5.002, sum=5.002 (1)",
+            "tab": "General information",
+            "score": 5.002
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.442,
+        "details": {
+          "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=1.002, mean=1.002, max=1.002, sum=1.002 (1)",
+            "tab": "Efficiency",
+            "score": 1.002433323539426
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)",
+            "tab": "General information",
+            "score": 268.15246636771303
+          },
+          "GPQA - # output tokens": {
+            "description": "min=5.422, mean=5.422, max=5.422, sum=5.422 (1)",
+            "tab": "General information",
+            "score": 5.42152466367713
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.935,
+        "details": {
+          "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=13.159, mean=13.159, max=13.159, sum=13.159 (1)",
+            "tab": "Efficiency",
+            "score": 13.15882584436103
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)",
+            "tab": "General information",
+            "score": 45.67097966728281
+          },
+          "IFEval - # output tokens": {
+            "description": "min=647.063, mean=647.063, max=647.063, sum=647.063 (1)",
+            "tab": "General information",
+            "score": 647.0628465804067
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.863,
+        "details": {
+          "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=28.081, mean=28.081, max=28.081, sum=28.081 (1)",
+            "tab": "Efficiency",
+            "score": 28.08133857488632
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=2059.716, mean=2059.716, max=2059.716, sum=2059.716 (1)",
+            "tab": "General information",
+            "score": 2059.716
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.464,
+        "details": {
+          "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=9.713, mean=9.713, max=9.713, sum=9.713 (1)",
+            "tab": "Efficiency",
+            "score": 9.713221177577973
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)",
+            "tab": "General information",
+            "score": 109.623
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=1256.266, mean=1256.266, max=1256.266, sum=1256.266 (1)",
+            "tab": "General information",
+            "score": 1256.266
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json b/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json
similarity index 80%
rename from data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json
rename to data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json
index 0b6f0418d..13795ec21 100644
--- a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json
+++ b/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.77,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json b/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json
similarity index 80%
rename from data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json
rename to data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json
index 36043d89a..d2f755b28 100644
--- a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json
+++ b/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.674,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json b/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json
similarity index 80%
rename from data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json
rename to data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json
index 2d017bb31..7455567bf 100644
--- a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json
+++ b/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "o3 (2025-04-16)",
+    "name": "o3 2025-04-16",
     "id": "openai/o3-2025-04-16",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.811,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json
similarity index 80%
rename from data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json
rename to data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json
index db654a7b8..c33228ef1 100644
--- a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json
+++ b/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "o4-mini (2025-04-16)",
+    "name": "o4-mini 2025-04-16",
     "id": "openai/o4-mini-2025-04-16",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.812,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json
similarity index 80%
rename from data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json
rename to data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json
index 7bc9ee7ae..31467bc1e 100644
--- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json
+++ b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2.5 Instruct Turbo (72B)",
+    "name": "Qwen2.5 Instruct Turbo 72B",
     "id": "qwen/qwen2.5-72b-instruct-turbo",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.599,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json
similarity index 80%
rename from data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json
rename to data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json
index 921d79480..0ac7225b8 100644
--- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json
+++ b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2.5 Instruct Turbo (7B)",
+    "name": "Qwen2.5 Instruct Turbo 7B",
     "id": "qwen/qwen2.5-7b-instruct-turbo",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.529,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json
similarity index 81%
rename from data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json
rename to data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json
index 7bc1c5881..1d36e4190 100644
--- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json
+++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.726,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json
similarity index 81%
rename from data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json
rename to data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json
index 355119fa7..04fc2f6cc 100644
--- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json
+++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.798,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json
new file mode 100644
index 000000000..bbcecd669
--- /dev/null
+++ b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json
@@ -0,0 +1,345 @@
+{
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
+  "source_metadata": {
+    "source_name": "helm_capabilities",
+    "source_type": "documentation",
+    "source_organization_name": "crfm",
+    "evaluator_relationship": "third_party"
+  },
+  "model_info": {
+    "name": "Qwen3-Next 80B A3B Thinking",
+    "id": "qwen/qwen3-next-80b-a3b-thinking",
+    "developer": "qwen",
+    "inference_platform": "unknown"
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "The mean of the scores from all columns.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.7,
+        "details": {
+          "tab": "Accuracy",
+          "Mean score - Efficiency": {
+            "description": null,
+            "tab": "Efficiency",
+            "score": 27.61164260375731
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.786,
+        "details": {
+          "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)",
+          "tab": "Accuracy",
+          "MMLU-Pro - Observed inference time (s)": {
+            "description": "min=20.097, mean=20.097, max=20.097, sum=20.097 (1)",
+            "tab": "Efficiency",
+            "score": 20.09722422862053
+          },
+          "MMLU-Pro - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "MMLU-Pro - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "MMLU-Pro - # prompt tokens": {
+            "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)",
+            "tab": "General information",
+            "score": 259.715
+          },
+          "MMLU-Pro - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.63,
+        "details": {
+          "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)",
+          "tab": "Accuracy",
+          "GPQA - Observed inference time (s)": {
+            "description": "min=40.06, mean=40.06, max=40.06, sum=40.06 (1)",
+            "tab": "Efficiency",
+            "score": 40.06039341950096
+          },
+          "GPQA - # eval": {
+            "description": "min=446, mean=446, max=446, sum=446 (1)",
+            "tab": "General information",
+            "score": 446.0
+          },
+          "GPQA - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "GPQA - # prompt tokens": {
+            "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)",
+            "tab": "General information",
+            "score": 274.36995515695065
+          },
+          "GPQA - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
+      }
+    },
+    {
+      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.81,
+        "details": {
+          "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)",
+          "tab": "Accuracy",
+          "IFEval - Observed inference time (s)": {
+            "description": "min=13.893, mean=13.893, max=13.893, sum=13.893 (1)",
+            "tab": "Efficiency",
+            "score": 13.89268838323639
+          },
+          "IFEval - # eval": {
+            "description": "min=541, mean=541, max=541, sum=541 (1)",
+            "tab": "General information",
+            "score": 541.0
+          },
+          "IFEval - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "IFEval - # prompt tokens": {
+            "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)",
+            "tab": "General information",
+            "score": 46.491682070240294
+          },
+          "IFEval - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.807,
+        "details": {
+          "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)",
+          "tab": "Accuracy",
+          "WildBench - Observed inference time (s)": {
+            "description": "min=23.095, mean=23.095, max=23.095, sum=23.095 (1)",
+            "tab": "Efficiency",
+            "score": 23.095464605808257
+          },
+          "WildBench - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "WildBench - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # prompt tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "WildBench - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {
+          "subset": "v2"
+        }
+      }
+    },
+    {
+      "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
+      "metric_config": {
+        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0
+      },
+      "score_details": {
+        "score": 0.467,
+        "details": {
+          "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)",
+          "tab": "Accuracy",
+          "Omni-MATH - Observed inference time (s)": {
+            "description": "min=40.912, mean=40.912, max=40.912, sum=40.912 (1)",
+            "tab": "Efficiency",
+            "score": 40.91244238162041
+          },
+          "Omni-MATH - # eval": {
+            "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
+            "tab": "General information",
+            "score": 1000.0
+          },
+          "Omni-MATH - # train": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - truncated": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          },
+          "Omni-MATH - # prompt tokens": {
+            "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)",
+            "tab": "General information",
+            "score": 111.6
+          },
+          "Omni-MATH - # output tokens": {
+            "description": "min=0, mean=0, max=0, sum=0 (1)",
+            "tab": "General information",
+            "score": 0.0
+          }
+        }
+      },
+      "generation_config": {
+        "additional_details": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json b/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json
similarity index 80%
rename from data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json
rename to data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json
index cf2b63d2e..da11997be 100644
--- a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json
+++ b/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-fin/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.577,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json b/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json
similarity index 80%
rename from data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json
rename to data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json
index 0d8108574..78088b82e 100644
--- a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json
+++ b/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-med/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/writer_palmyra-med/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.476,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json b/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json
similarity index 80%
rename from data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json
rename to data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json
index 8b3240898..b630b3cd0 100644
--- a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json
+++ b/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.609,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json b/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json
similarity index 80%
rename from data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json
rename to data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json
index ba834a256..c212295f1 100644
--- a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json
+++ b/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-x5/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.696,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json b/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json
similarity index 80%
rename from data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json
rename to data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json
index 7640dfe10..34f4be43b 100644
--- a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json
+++ b/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/xai_grok-3-beta/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.727,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json b/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json
similarity index 80%
rename from data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json
rename to data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json
index 8570e4d80..31dab978e 100644
--- a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json
+++ b/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.679,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json b/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json
similarity index 80%
rename from data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json
rename to data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json
index b9fbeb3c0..872c4f1f5 100644
--- a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json
+++ b/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/xai_grok-4-0709/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Grok 4 (0709)",
+    "name": "Grok 4 0709",
     "id": "xai/grok-4-0709",
     "developer": "xai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.785,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json
similarity index 81%
rename from data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json
rename to data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json
index b4d4807f0..4ead0f554 100644
--- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json
+++ b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1767657480.2939079",
-  "retrieved_timestamp": "1767657480.2939079",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770830201.581632",
+  "retrieved_timestamp": "1770830201.581632",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean score",
+      "source_data": {
+        "dataset_name": "helm_capabilities",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The mean of the scores from all columns.",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.67,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean score - Efficiency": {
             "description": null,
@@ -39,10 +42,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU-Pro - COT correct",
+      "source_data": {
+        "dataset_name": "MMLU-Pro",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -88,13 +100,22 @@
         }
       },
       "generation_config": {
-        "subset": "all",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "all",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "GPQA - COT correct",
+      "source_data": {
+        "dataset_name": "GPQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
         "lower_is_better": false,
@@ -140,13 +161,22 @@
         }
       },
       "generation_config": {
-        "subset": "gpqa_main",
-        "use_chain_of_thought": "true",
-        "use_few_shot": "false"
+        "additional_details": {
+          "subset": "gpqa_main",
+          "use_chain_of_thought": "true",
+          "use_few_shot": "false"
+        }
       }
     },
     {
       "evaluation_name": "IFEval - IFEval Strict Acc",
+      "source_data": {
+        "dataset_name": "IFEval",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
         "lower_is_better": false,
@@ -191,10 +221,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WildBench - WB Score",
+      "source_data": {
+        "dataset_name": "WildBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
         "lower_is_better": false,
@@ -240,11 +279,20 @@
         }
       },
       "generation_config": {
-        "subset": "v2"
+        "additional_details": {
+          "subset": "v2"
+        }
       }
     },
     {
       "evaluation_name": "Omni-MATH - Acc",
+      "source_data": {
+        "dataset_name": "Omni-MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
         "lower_is_better": false,
@@ -289,7 +337,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json
similarity index 92%
rename from data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json
rename to data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json
index fac51642a..58e2410df 100644
--- a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json
+++ b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/anthropic_Anthropic-LM-v4-s3-52B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Anthropic-LM v4-s3 52B",
-    "id": "anthropic/Anthropic-LM-v4-s3-52B",
-    "developer": "anthropic",
+    "id": "Anthropic-LM-v4-s3-52B",
+    "developer": "unknown",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.78,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json b/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json
rename to data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json
index 05d951313..4805e7ac8 100644
--- a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json
+++ b/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.433,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json
rename to data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json
index cc58c06c0..e47585440 100644
--- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json
+++ b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.706,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json
rename to data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json
index 0be03d012..bfd78fa42 100644
--- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json
+++ b/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.517,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json
rename to data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json
index 3239df52d..a1c2d2860 100644
--- a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json
+++ b/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.285,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json
similarity index 92%
rename from data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json
rename to data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json
index 71ff2dc38..e9db23ac9 100644
--- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json
+++ b/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.743,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json
similarity index 92%
rename from data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json
rename to data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json
index ab1f54c90..38cd07e2a 100644
--- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json
+++ b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.824,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json
similarity index 91%
rename from data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json
rename to data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json
index 14e3a243d..589346e15 100644
--- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json
+++ b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.553,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json
similarity index 92%
rename from data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json
rename to data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json
index 9fccefc67..371a206a5 100644
--- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json
+++ b/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.315,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json
similarity index 92%
rename from data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json
rename to data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json
index 9f9536338..715673aae 100644
--- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json
+++ b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.485,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json
similarity index 92%
rename from data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json
rename to data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json
index ed0fa9dcd..5f8731441 100644
--- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json
+++ b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.662,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json b/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json
similarity index 92%
rename from data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json
rename to data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json
index 19831593f..04305416d 100644
--- a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json
+++ b/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.446,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json b/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json
similarity index 93%
rename from data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json
rename to data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json
index af37640ca..1bbeba7ff 100644
--- a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json
+++ b/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/bigscience_T0pp-11B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.197,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json
rename to data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json
index 5eb323191..fadfb62da 100644
--- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json
+++ b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.874,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json
rename to data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json
index d20d6332d..b1c061a45 100644
--- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json
+++ b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.675,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json
rename to data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json
index 54182b504..bd838c107 100644
--- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json
+++ b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.372,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json
rename to data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json
index ecba92b3a..3e7a0f6fa 100644
--- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json
+++ b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.23,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json
rename to data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json
index 0b33b0763..745f99da6 100644
--- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json
+++ b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.312,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json
rename to data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json
index 4abc0c79b..478f77b1d 100644
--- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json
+++ b/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.109,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json
rename to data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json
index 6c362be4c..2039d0727 100644
--- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json
+++ b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.56,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json
rename to data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json
index f92b78094..216532187 100644
--- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json
+++ b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.664,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json b/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json
similarity index 91%
rename from data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json
rename to data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json
index f6f9d6eae..8f5d16956 100644
--- a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json
+++ b/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/eleuther-ai_Pythia-12B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Pythia 12B",
-    "id": "eleuther-ai/Pythia-12B",
-    "developer": "eleuther-ai",
+    "id": "eleutherai/Pythia-12B",
+    "developer": "eleutherai",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.257,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json b/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json
similarity index 91%
rename from data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json
rename to data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json
index 2b488fa6f..20ca16498 100644
--- a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json
+++ b/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/eleuther-ai_Pythia-6.9B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Pythia 6.9B",
-    "id": "eleuther-ai/Pythia-6.9B",
-    "developer": "eleuther-ai",
+    "id": "eleutherai/Pythia-6.9B",
+    "developer": "eleutherai",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.196,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json b/data/helm_classic/writer/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json
similarity index 91%
rename from data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json
rename to data/helm_classic/writer/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json
index 725954e16..d36f642d7 100644
--- a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json
+++ b/data/helm_classic/writer/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/writer_Palmyra-X-43B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/writer_Palmyra-X-43B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Palmyra X 43B",
     "id": "writer/Palmyra-X-43B",
     "developer": "writer",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.732,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json b/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json
similarity index 92%
rename from data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json
rename to data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json
index 9bacd9bf9..0f7601506 100644
--- a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json
+++ b/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/google_T5-11B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/google_T5-11B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.131,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json b/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json
similarity index 92%
rename from data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json
rename to data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json
index c9bf42a12..70193f3b0 100644
--- a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json
+++ b/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/google_UL2-20B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/google_UL2-20B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.167,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json
similarity index 91%
rename from data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json
rename to data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json
index 65a179431..385ac9b25 100644
--- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json
+++ b/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.706,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json
similarity index 91%
rename from data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json
rename to data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json
index bf5b7f8ab..3de9b1fd2 100644
--- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json
+++ b/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.625,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json b/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json
rename to data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json
index b007605c7..ac2da41ef 100644
--- a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json
+++ b/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-13B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_LLaMA-13B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.595,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json b/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json
rename to data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json
index 8e6647f52..1b33fd761 100644
--- a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json
+++ b/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-30B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_LLaMA-30B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.781,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json b/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json
rename to data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json
index 1dbaa6d85..a7d6351b2 100644
--- a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json
+++ b/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-65B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_LLaMA-65B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.908,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json b/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json
rename to data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json
index 4a772fb18..79b00a818 100644
--- a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json
+++ b/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_LLaMA-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.533,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json b/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json
similarity index 91%
rename from data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json
rename to data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json
index de40c742e..170095f5b 100644
--- a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json
+++ b/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_Llama-2-13B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_Llama-2-13B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.823,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json b/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json
similarity index 91%
rename from data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json
rename to data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json
index 77f6938f9..01e4b1b14 100644
--- a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json
+++ b/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_Llama-2-70B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_Llama-2-70B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.944,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json b/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json
similarity index 91%
rename from data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json
rename to data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json
index 3b3b39208..40b71e7de 100644
--- a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json
+++ b/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_Llama-2-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_Llama-2-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.607,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json b/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json
similarity index 92%
rename from data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json
rename to data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json
index 0da99434e..f864b9222 100644
--- a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json
+++ b/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_OPT-175B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_OPT-175B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.609,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json b/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json
similarity index 92%
rename from data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json
rename to data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json
index 929a020d2..ba62ce7d1 100644
--- a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json
+++ b/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/meta_OPT-66B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/meta_OPT-66B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.448,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json b/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json
similarity index 92%
rename from data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json
rename to data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json
index 786e640a5..ce5dcad88 100644
--- a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json
+++ b/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.787,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json
similarity index 92%
rename from data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json
rename to data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json
index ade6f8a0a..cfa4e8177 100644
--- a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json
+++ b/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.309,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json
similarity index 91%
rename from data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json
rename to data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json
index a4f716c06..738857e58 100644
--- a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json
+++ b/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/mistral-ai_Mistral-v0.1-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Mistral v0.1 7B",
-    "id": "mistral-ai/Mistral-v0.1-7B",
-    "developer": "mistral-ai",
+    "id": "mistralai/Mistral-v0.1-7B",
+    "developer": "mistralai",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.884,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json b/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json
similarity index 91%
rename from data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json
rename to data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json
index bf414b629..2580877d4 100644
--- a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json
+++ b/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/mosaicml_MPT-30B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.714,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json
similarity index 91%
rename from data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json
rename to data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json
index dd4c71e77..a7cbf9856 100644
--- a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json
+++ b/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.716,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json b/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json
similarity index 92%
rename from data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json
rename to data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json
index 64c16a070..c135cdcfb 100644
--- a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json
+++ b/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/eleuther-ai_GPT-J-6B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_GPT-J-6B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "GPT-J 6B",
-    "id": "eleuther-ai/GPT-J-6B",
-    "developer": "eleuther-ai",
+    "id": "openai/GPT-J-6B",
+    "developer": "openai",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.273,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json b/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json
similarity index 92%
rename from data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json
rename to data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json
index b26d9ed28..d4e4c3e18 100644
--- a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json
+++ b/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/eleuther-ai_GPT-NeoX-20B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "GPT-NeoX 20B",
-    "id": "eleuther-ai/GPT-NeoX-20B",
-    "developer": "eleuther-ai",
+    "id": "openai/GPT-NeoX-20B",
+    "developer": "openai",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.351,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json b/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json
similarity index 94%
rename from data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json
rename to data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json
index ee84122f5..ae351a8ab 100644
--- a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json
+++ b/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_ada-350M/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_ada-350M/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.108,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json b/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json
similarity index 94%
rename from data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json
rename to data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json
index 3a55a8db1..4f76e1f1b 100644
--- a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json
+++ b/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_babbage-1.3B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_babbage-1.3B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.114,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json b/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json
similarity index 94%
rename from data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json
rename to data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json
index d7959e7bb..84c344282 100644
--- a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json
+++ b/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_curie-6.7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_curie-6.7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.247,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json b/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json
similarity index 94%
rename from data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json
rename to data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json
index 6b30fefef..fb0de7bd8 100644
--- a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json
+++ b/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_davinci-175B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_davinci-175B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.538,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json
similarity index 91%
rename from data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json
rename to data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json
index 435cb040d..582bc2e6a 100644
--- a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json
+++ b/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.76,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json
similarity index 91%
rename from data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json
rename to data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json
index bf7553bf6..5a9810e18 100644
--- a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json
+++ b/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.783,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json b/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json
similarity index 94%
rename from data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json
rename to data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json
index d1a92ef67..0e0d9602b 100644
--- a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json
+++ b/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_text-ada-001/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_text-ada-001/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.107,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json b/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json
similarity index 94%
rename from data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json
rename to data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json
index fb51f6a42..734c00775 100644
--- a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json
+++ b/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_text-babbage-001/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_text-babbage-001/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.229,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json b/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json
similarity index 94%
rename from data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json
rename to data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json
index bb4d6e7ff..ba874427c 100644
--- a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json
+++ b/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_text-curie-001/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_text-curie-001/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.36,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json b/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json
similarity index 94%
rename from data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json
rename to data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json
index 4d9b820e6..4555e0f80 100644
--- a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json
+++ b/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_text-davinci-002/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_text-davinci-002/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.905,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json b/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json
similarity index 94%
rename from data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json
rename to data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json
index 437247369..7fd229e00 100644
--- a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json
+++ b/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/openai_text-davinci-003/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/openai_text-davinci-003/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.872,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json b/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json
similarity index 91%
rename from data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json
rename to data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json
index 24ce27c0b..f68731052 100644
--- a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json
+++ b/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/stanford_Alpaca-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.381,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json b/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json
similarity index 91%
rename from data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json
rename to data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json
index a08e1b6ca..85693f897 100644
--- a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json
+++ b/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/tii-uae_Falcon-40B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Falcon 40B",
-    "id": "tii-uae/Falcon-40B",
-    "developer": "tii-uae",
+    "id": "tiiuae/Falcon-40B",
+    "developer": "tiiuae",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.729,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json b/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json
similarity index 91%
rename from data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json
rename to data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json
index 0911bfafa..e165123de 100644
--- a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json
+++ b/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/tii-uae_Falcon-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Falcon 7B",
-    "id": "tii-uae/Falcon-7B",
-    "developer": "tii-uae",
+    "id": "tiiuae/Falcon-7B",
+    "developer": "tiiuae",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.378,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json
similarity index 91%
rename from data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json
rename to data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json
index 99345e7ef..3c1369c88 100644
--- a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json
+++ b/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-40B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Falcon-Instruct 40B",
-    "id": "tii-uae/Falcon-Instruct-40B",
-    "developer": "tii-uae",
+    "id": "tiiuae/Falcon-Instruct-40B",
+    "developer": "tiiuae",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.727,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json
similarity index 91%
rename from data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json
rename to data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json
index b0b75c2b1..19076bf3f 100644
--- a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json
+++ b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -13,13 +10,20 @@
   },
   "model_info": {
     "name": "Falcon-Instruct 7B",
-    "id": "tii-uae/Falcon-Instruct-7B",
-    "developer": "tii-uae",
+    "id": "tiiuae/Falcon-Instruct-7B",
+    "developer": "tiiuae",
     "inference_platform": "unknown"
   },
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.244,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json
rename to data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json
index 66ae49567..90ced7618 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.378,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json
rename to data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json
index f09058f3c..858c06ee0 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.311,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json
rename to data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json
index 9ed3b7bf9..e246416bd 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.524,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json
rename to data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json
index bb56f1198..828ab9683 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.366,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json b/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json
similarity index 91%
rename from data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json
rename to data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json
index add4859be..0d4ab9c94 100644
--- a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json
+++ b/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.568,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json b/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json
similarity index 91%
rename from data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json
rename to data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json
index 74662144a..4b439ab57 100644
--- a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json
+++ b/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/yandex_YaLM-100B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/yandex_YaLM-100B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.075,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json b/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json
similarity index 91%
rename from data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json
rename to data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json
index 2f17c575d..67e0f75ce 100644
--- a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json
+++ b/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1768090731.5328572",
-  "retrieved_timestamp": "1768090731.5328572",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770830385.7573261",
+  "retrieved_timestamp": "1770830385.7573261",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_classic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.512,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Calibration": {
             "description": null,
@@ -74,10 +77,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -142,10 +154,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "BoolQ - EM",
+      "source_data": {
+        "dataset_name": "BoolQ",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -235,10 +256,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -328,10 +358,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (open-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -496,10 +535,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "QuAC - F1",
+      "source_data": {
+        "dataset_name": "QuAC",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -589,10 +637,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "HellaSwag - EM",
+      "source_data": {
+        "dataset_name": "HellaSwag",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -657,10 +714,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -725,10 +791,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "TruthfulQA - EM",
+      "source_data": {
+        "dataset_name": "TruthfulQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -793,10 +868,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "source_data": {
+        "dataset_name": "MS MARCO (TREC)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
         "lower_is_better": false,
@@ -951,10 +1035,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "source_data": {
+        "dataset_name": "CNN/DailyMail",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1074,10 +1167,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "XSUM - ROUGE-2",
+      "source_data": {
+        "dataset_name": "XSUM",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
         "lower_is_better": false,
@@ -1197,10 +1299,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "IMDB - EM",
+      "source_data": {
+        "dataset_name": "IMDB",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1290,10 +1401,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "CivilComments - EM",
+      "source_data": {
+        "dataset_name": "CivilComments",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1383,10 +1503,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "RAFT - EM",
+      "source_data": {
+        "dataset_name": "RAFT",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -1476,7 +1605,9 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json b/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json
similarity index 61%
rename from data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json
rename to data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json
index 841d52f14..abd3e5e5b 100644
--- a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json
+++ b/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1768085895.632564",
-  "retrieved_timestamp": "1768085895.632564",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770830411.78817",
+  "retrieved_timestamp": "1770830411.78817",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,14 +34,22 @@
       "score_details": {
         "score": 0.611,
         "details": {
-          "description": null,
           "tab": "Instruction Following"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Anthropic RLHF dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -53,30 +65,39 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "hh",
-          "hh",
-          "hh",
-          "hh",
-          "red_team",
-          "red_team",
-          "red_team",
-          "red_team"
-        ],
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale",
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "subset": [
+            "hh",
+            "hh",
+            "hh",
+            "hh",
+            "red_team",
+            "red_team",
+            "red_team",
+            "red_team"
+          ],
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale",
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "source_data": {
+        "dataset_name": "Best ChatGPT Prompts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -92,18 +113,27 @@
         }
       },
       "generation_config": {
-        "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
-        "tags": "",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
+          "tags": "",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Koala test dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Koala test dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -119,16 +149,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Open Assistant - Harmlessness",
+      "source_data": {
+        "dataset_name": "Open Assistant",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -144,17 +183,26 @@
         }
       },
       "generation_config": {
-        "language": "en",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "language": "en",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Self Instruct - Harmlessness",
+      "source_data": {
+        "dataset_name": "Self Instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -170,16 +218,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Vicuna - Harmlessness",
+      "source_data": {
+        "dataset_name": "Vicuna",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -195,13 +252,15 @@
         }
       },
       "generation_config": {
-        "category": "all",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "category": "all",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json b/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json
similarity index 61%
rename from data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json
rename to data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json
index 0905e2f21..3aea06a21 100644
--- a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json
+++ b/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1768085895.632564",
-  "retrieved_timestamp": "1768085895.632564",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770830411.78817",
+  "retrieved_timestamp": "1770830411.78817",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Cohere Command beta (52.4B)",
+    "name": "Cohere Command beta 52.4B",
     "id": "cohere/command-xlarge-beta",
     "developer": "cohere",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,14 +34,22 @@
       "score_details": {
         "score": 0.089,
         "details": {
-          "description": null,
           "tab": "Instruction Following"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Anthropic RLHF dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -53,30 +65,39 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "hh",
-          "hh",
-          "hh",
-          "hh",
-          "red_team",
-          "red_team",
-          "red_team",
-          "red_team"
-        ],
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale",
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "subset": [
+            "hh",
+            "hh",
+            "hh",
+            "hh",
+            "red_team",
+            "red_team",
+            "red_team",
+            "red_team"
+          ],
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale",
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "source_data": {
+        "dataset_name": "Best ChatGPT Prompts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -92,18 +113,27 @@
         }
       },
       "generation_config": {
-        "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
-        "tags": "",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
+          "tags": "",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Koala test dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Koala test dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -119,16 +149,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Open Assistant - Harmlessness",
+      "source_data": {
+        "dataset_name": "Open Assistant",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -144,17 +183,26 @@
         }
       },
       "generation_config": {
-        "language": "en",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "language": "en",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Self Instruct - Harmlessness",
+      "source_data": {
+        "dataset_name": "Self Instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -170,16 +218,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Vicuna - Harmlessness",
+      "source_data": {
+        "dataset_name": "Vicuna",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -195,13 +252,15 @@
         }
       },
       "generation_config": {
-        "category": "all",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "category": "all",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json
similarity index 61%
rename from data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json
rename to data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json
index 4dc9e1ef5..31fd0891a 100644
--- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json
+++ b/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1768085895.632564",
-  "retrieved_timestamp": "1768085895.632564",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770830411.78817",
+  "retrieved_timestamp": "1770830411.78817",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-3.5 Turbo (0613)",
+    "name": "GPT-3.5 Turbo 0613",
     "id": "openai/gpt-3.5-turbo-0613",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,14 +34,22 @@
       "score_details": {
         "score": 0.689,
         "details": {
-          "description": null,
           "tab": "Instruction Following"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Anthropic RLHF dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -53,30 +65,39 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "hh",
-          "hh",
-          "hh",
-          "hh",
-          "red_team",
-          "red_team",
-          "red_team",
-          "red_team"
-        ],
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale",
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "subset": [
+            "hh",
+            "hh",
+            "hh",
+            "hh",
+            "red_team",
+            "red_team",
+            "red_team",
+            "red_team"
+          ],
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale",
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "source_data": {
+        "dataset_name": "Best ChatGPT Prompts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -92,18 +113,27 @@
         }
       },
       "generation_config": {
-        "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
-        "tags": "",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
+          "tags": "",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Koala test dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Koala test dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -119,16 +149,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Open Assistant - Harmlessness",
+      "source_data": {
+        "dataset_name": "Open Assistant",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -144,17 +183,26 @@
         }
       },
       "generation_config": {
-        "language": "en",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "language": "en",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Self Instruct - Harmlessness",
+      "source_data": {
+        "dataset_name": "Self Instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -170,16 +218,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Vicuna - Harmlessness",
+      "source_data": {
+        "dataset_name": "Vicuna",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -195,13 +252,15 @@
         }
       },
       "generation_config": {
-        "category": "all",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "category": "all",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json b/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json
similarity index 61%
rename from data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json
rename to data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json
index f76268b07..ac8e25cb0 100644
--- a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json
+++ b/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_instruct/openai_gpt-4-0314/1768085895.632564",
-  "retrieved_timestamp": "1768085895.632564",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770830411.78817",
+  "retrieved_timestamp": "1770830411.78817",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 (0314)",
+    "name": "GPT-4 0314",
     "id": "openai/gpt-4-0314",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperform on average (over columns).",
         "lower_is_better": false,
@@ -30,14 +34,22 @@
       "score_details": {
         "score": 0.611,
         "details": {
-          "description": null,
           "tab": "Instruction Following"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Anthropic RLHF dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -53,30 +65,39 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "hh",
-          "hh",
-          "hh",
-          "hh",
-          "red_team",
-          "red_team",
-          "red_team",
-          "red_team"
-        ],
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale",
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "subset": [
+            "hh",
+            "hh",
+            "hh",
+            "hh",
+            "red_team",
+            "red_team",
+            "red_team",
+            "red_team"
+          ],
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale",
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "source_data": {
+        "dataset_name": "Best ChatGPT Prompts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -92,18 +113,27 @@
         }
       },
       "generation_config": {
-        "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
-        "tags": "",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml",
+          "tags": "",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Koala test dataset - Harmlessness",
+      "source_data": {
+        "dataset_name": "Koala test dataset",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -119,16 +149,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Open Assistant - Harmlessness",
+      "source_data": {
+        "dataset_name": "Open Assistant",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -144,17 +183,26 @@
         }
       },
       "generation_config": {
-        "language": "en",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "language": "en",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Self Instruct - Harmlessness",
+      "source_data": {
+        "dataset_name": "Self Instruct",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -170,16 +218,25 @@
         }
       },
       "generation_config": {
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Vicuna - Harmlessness",
+      "source_data": {
+        "dataset_name": "Vicuna",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
         "lower_is_better": false,
@@ -195,13 +252,15 @@
         }
       },
       "generation_config": {
-        "category": "all",
-        "evaluator": [
-          "claude",
-          "gpt4",
-          "mturk",
-          "scale"
-        ]
+        "additional_details": {
+          "category": "all",
+          "evaluator": [
+            "claude",
+            "gpt4",
+            "mturk",
+            "scale"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json b/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json
similarity index 82%
rename from data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json
rename to data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json
index 8d9b0c6e7..96c3d4d2d 100644
--- a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json
+++ b/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/01-ai_yi-34b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/01-ai_yi-34b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Yi (34B)",
+    "name": "Yi 34B",
     "id": "01-ai/yi-34b",
     "developer": "01-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.57,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json b/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json
similarity index 82%
rename from data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json
rename to data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json
index 04e690e09..497d98a06 100644
--- a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json
+++ b/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/01-ai_yi-6b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/01-ai_yi-6b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Yi (6B)",
+    "name": "Yi 6B",
     "id": "01-ai/yi-6b",
     "developer": "01-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.253,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json b/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json
similarity index 82%
rename from data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json
rename to data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json
index 6d66d647a..7bea38ffb 100644
--- a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json
+++ b/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/01-ai_yi-large-preview/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Yi Large (Preview)",
+    "name": "Yi Large Preview",
     "id": "01-ai/yi-large-preview",
     "developer": "01-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.471,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json b/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json
similarity index 82%
rename from data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json
rename to data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json
index 4d89d0b52..00a6f037c 100644
--- a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json
+++ b/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Luminous Base (13B)",
+    "name": "Luminous Base 13B",
     "id": "AlephAlpha/luminous-base",
     "developer": "AlephAlpha",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.041,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json b/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json
similarity index 82%
rename from data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json
rename to data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json
index 74581377a..215983cef 100644
--- a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json
+++ b/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Luminous Extended (30B)",
+    "name": "Luminous Extended 30B",
     "id": "AlephAlpha/luminous-extended",
     "developer": "AlephAlpha",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.078,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json b/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json
similarity index 82%
rename from data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json
rename to data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json
index 9f7e37eaf..81f487c09 100644
--- a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json
+++ b/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Luminous Supreme (70B)",
+    "name": "Luminous Supreme 70B",
     "id": "AlephAlpha/luminous-supreme",
     "developer": "AlephAlpha",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.145,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json b/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json
similarity index 82%
rename from data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json
rename to data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json
index 9efa2b824..ef3567598 100644
--- a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json
+++ b/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/ai21_j2-grande/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/ai21_j2-grande/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Jurassic-2 Grande (17B)",
+    "name": "Jurassic-2 Grande 17B",
     "id": "ai21/j2-grande",
     "developer": "ai21",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.172,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json b/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json
similarity index 82%
rename from data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json
rename to data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json
index 1c64f2731..f39f9c93e 100644
--- a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json
+++ b/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/ai21_j2-jumbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/ai21_j2-jumbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Jurassic-2 Jumbo (178B)",
+    "name": "Jurassic-2 Jumbo 178B",
     "id": "ai21/j2-jumbo",
     "developer": "ai21",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.215,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json b/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json
similarity index 82%
rename from data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json
rename to data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json
index 634cd87ae..d7dee0e9a 100644
--- a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json
+++ b/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.637,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json b/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json
similarity index 82%
rename from data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json
rename to data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json
index 3483b0b9a..f65e65120 100644
--- a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json
+++ b/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.414,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json b/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json
similarity index 82%
rename from data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json
rename to data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json
index 527fb50a5..a3e5bda34 100644
--- a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json
+++ b/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/ai21_jamba-instruct/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/ai21_jamba-instruct/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.287,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,12 +628,14 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json b/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json
similarity index 82%
rename from data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json
rename to data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json
index 51634a355..51375c00c 100644
--- a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json
+++ b/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/allenai_olmo-7b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/allenai_olmo-7b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "OLMo (7B)",
+    "name": "OLMo 7B",
     "id": "allenai/olmo-7b",
     "developer": "allenai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.052,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json b/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json
similarity index 82%
rename from data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json
rename to data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json
index e8381a3f3..289dc9306 100644
--- a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json
+++ b/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.708,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json b/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json
similarity index 82%
rename from data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json
rename to data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json
index 8fb5d6b37..bcd94c63d 100644
--- a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json
+++ b/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.524,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json b/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json
similarity index 82%
rename from data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json
rename to data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json
index 52c65584f..c8589b186 100644
--- a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json
+++ b/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.885,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json b/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json
rename to data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json
index b883ce7c5..2f9d0f3e2 100644
--- a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json
+++ b/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-2.0/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-2.0/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.489,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json b/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json
rename to data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json
index 388a1840c..fe9f851b2 100644
--- a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json
+++ b/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-2.1/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-2.1/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.437,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json
rename to data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json
index 231b91f4e..9eecf8a25 100644
--- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json
+++ b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Haiku (20241022)",
+    "name": "Claude 3.5 Haiku 20241022",
     "id": "anthropic/claude-3-5-haiku-20241022",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.531,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json
rename to data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json
index 0ee2e76e5..f3aab2968 100644
--- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json
+++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Sonnet (20240620)",
+    "name": "Claude 3.5 Sonnet 20240620",
     "id": "anthropic/claude-3-5-sonnet-20240620",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.885,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json
rename to data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json
index d816a8a2a..6a814b17d 100644
--- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json
+++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Sonnet (20241022)",
+    "name": "Claude 3.5 Sonnet 20241022",
     "id": "anthropic/claude-3-5-sonnet-20241022",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.846,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json
rename to data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json
index 66e3c14b8..54328bd79 100644
--- a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json
+++ b/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3 Haiku (20240307)",
+    "name": "Claude 3 Haiku 20240307",
     "id": "anthropic/claude-3-haiku-20240307",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.263,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json b/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json
rename to data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json
index 27c9ec758..ad60ccaa3 100644
--- a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json
+++ b/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3 Opus (20240229)",
+    "name": "Claude 3 Opus 20240229",
     "id": "anthropic/claude-3-opus-20240229",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.683,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json
rename to data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json
index 3cbea3718..35374c2f9 100644
--- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json
+++ b/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3 Sonnet (20240229)",
+    "name": "Claude 3 Sonnet 20240229",
     "id": "anthropic/claude-3-sonnet-20240229",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.377,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json b/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json
rename to data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json
index a1592f60e..7dbf7e9ee 100644
--- a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json
+++ b/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.399,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json b/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json
similarity index 82%
rename from data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json
rename to data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json
index e73713e6a..04da077b3 100644
--- a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json
+++ b/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/anthropic_claude-v1.3/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.518,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json b/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json
similarity index 82%
rename from data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json
rename to data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json
index aabe52512..b4ccf63fd 100644
--- a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json
+++ b/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/cohere_command-light/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/cohere_command-light/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.105,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json b/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json
similarity index 82%
rename from data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json
rename to data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json
index 288bdd798..e941df44c 100644
--- a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json
+++ b/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/cohere_command-r-plus/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/cohere_command-r-plus/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.441,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json b/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json
similarity index 82%
rename from data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json
rename to data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json
index 33b212443..2314d1d0d 100644
--- a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json
+++ b/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/cohere_command-r/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/cohere_command-r/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.299,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json b/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json
similarity index 82%
rename from data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json
rename to data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json
index b95f59ea4..95909d3aa 100644
--- a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json
+++ b/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/cohere_command/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/cohere_command/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.327,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json b/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json
similarity index 82%
rename from data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json
rename to data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json
index 7cf9a9388..81dc83db8 100644
--- a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json
+++ b/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/databricks_dbrx-instruct/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.289,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json
similarity index 82%
rename from data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json
rename to data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json
index bf2730468..31cee265a 100644
--- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json
+++ b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "DeepSeek LLM Chat (67B)",
+    "name": "DeepSeek LLM Chat 67B",
     "id": "deepseek-ai/deepseek-llm-67b-chat",
     "developer": "deepseek-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.488,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json b/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json
similarity index 82%
rename from data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json
rename to data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json
index e07480be1..cc64f30ee 100644
--- a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json
+++ b/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.908,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json b/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json
similarity index 82%
rename from data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json
rename to data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json
index eefe2f954..f6af740ee 100644
--- a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json
+++ b/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.0 Pro (002)",
+    "name": "Gemini 1.0 Pro 002",
     "id": "google/gemini-1.0-pro-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.422,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json b/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json
similarity index 82%
rename from data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json
rename to data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json
index e10645540..7c312bb83 100644
--- a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json
+++ b/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Flash (001)",
+    "name": "Gemini 1.5 Flash 001",
     "id": "google/gemini-1.5-flash-001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.667,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json b/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json
similarity index 82%
rename from data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json
rename to data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json
index 8e4eb067b..450dbafcb 100644
--- a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json
+++ b/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Flash (002)",
+    "name": "Gemini 1.5 Flash 002",
     "id": "google/gemini-1.5-flash-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.573,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json b/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json
similarity index 82%
rename from data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json
rename to data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json
index 38c3a236a..653e006ee 100644
--- a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json
+++ b/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Pro (001)",
+    "name": "Gemini 1.5 Pro 001",
     "id": "google/gemini-1.5-pro-001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.739,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json b/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json
similarity index 82%
rename from data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json
rename to data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json
index cada735aa..64f712478 100644
--- a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json
+++ b/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Pro (002)",
+    "name": "Gemini 1.5 Pro 002",
     "id": "google/gemini-1.5-pro-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.842,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json b/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json
similarity index 82%
rename from data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json
rename to data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json
index 1487ce304..371c57f27 100644
--- a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json
+++ b/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 2.0 Flash (Experimental)",
+    "name": "Gemini 2.0 Flash Experimental",
     "id": "google/gemini-2.0-flash-exp",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.813,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json b/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json
similarity index 82%
rename from data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json
rename to data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json
index 29456a114..24f598da3 100644
--- a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json
+++ b/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemma-2-27b-it/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemma 2 Instruct (27B)",
+    "name": "Gemma 2 Instruct 27B",
     "id": "google/gemma-2-27b-it",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.675,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json b/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json
similarity index 82%
rename from data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json
rename to data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json
index 75457f70d..1e65ff610 100644
--- a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json
+++ b/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemma-2-9b-it/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemma 2 Instruct (9B)",
+    "name": "Gemma 2 Instruct 9B",
     "id": "google/gemma-2-9b-it",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.562,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json b/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json
similarity index 82%
rename from data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json
rename to data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json
index dabc86d10..ee614ce44 100644
--- a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json
+++ b/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_gemma-7b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_gemma-7b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemma (7B)",
+    "name": "Gemma 7B",
     "id": "google/gemma-7b",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.336,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json b/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json
similarity index 82%
rename from data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json
rename to data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json
index 9c9727ed0..7d7c944f0 100644
--- a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json
+++ b/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_text-bison@001/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_text-bison@001/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "PaLM-2 (Bison)",
+    "name": "PaLM-2 Bison",
     "id": "google/text-bison@001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.526,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json b/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json
similarity index 82%
rename from data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json
rename to data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json
index 2e152e4a7..f19d99b14 100644
--- a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json
+++ b/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/google_text-unicorn@001/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/google_text-unicorn@001/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "PaLM-2 (Unicorn)",
+    "name": "PaLM-2 Unicorn",
     "id": "google/text-unicorn@001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.644,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json b/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json
similarity index 82%
rename from data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json
rename to data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json
index a5b394c06..f38e87995 100644
--- a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json
+++ b/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-2-13b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-2-13b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 2 (13B)",
+    "name": "Llama 2 13B",
     "id": "meta/llama-2-13b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.233,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json b/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json
similarity index 82%
rename from data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json
rename to data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json
index cf4407980..b0d616c29 100644
--- a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json
+++ b/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-2-70b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-2-70b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 2 (70B)",
+    "name": "Llama 2 70B",
     "id": "meta/llama-2-70b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.482,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json b/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json
similarity index 82%
rename from data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json
rename to data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json
index 3b18db79e..1e9aac924 100644
--- a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json
+++ b/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-2-7b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-2-7b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 2 (7B)",
+    "name": "Llama 2 7B",
     "id": "meta/llama-2-7b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.152,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json b/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json
similarity index 82%
rename from data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json
rename to data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json
index 90d04801d..f13ee8122 100644
--- a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json
+++ b/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3-70b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3-70b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3 (70B)",
+    "name": "Llama 3 70B",
     "id": "meta/llama-3-70b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.793,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json b/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json
similarity index 82%
rename from data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json
rename to data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json
index 0e3ff704d..7f18bf5fd 100644
--- a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json
+++ b/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3-8b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3-8b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3 (8B)",
+    "name": "Llama 3 8B",
     "id": "meta/llama-3-8b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.387,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json
similarity index 82%
rename from data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json
rename to data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json
index 8311edd73..6ef4300e5 100644
--- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json
+++ b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (405B)",
+    "name": "Llama 3.1 Instruct Turbo 405B",
     "id": "meta/llama-3.1-405b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.854,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json
similarity index 82%
rename from data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json
rename to data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json
index 3e59bea75..8afc05c39 100644
--- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json
+++ b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (70B)",
+    "name": "Llama 3.1 Instruct Turbo 70B",
     "id": "meta/llama-3.1-70b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.808,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json
similarity index 82%
rename from data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json
rename to data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json
index 300f5dbb2..e5cc6d55a 100644
--- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json
+++ b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (8B)",
+    "name": "Llama 3.1 Instruct Turbo 8B",
     "id": "meta/llama-3.1-8b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.303,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json
similarity index 82%
rename from data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json
rename to data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json
index 4daa7f500..793304d91 100644
--- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json
+++ b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.2 Vision Instruct Turbo (11B)",
+    "name": "Llama 3.2 Vision Instruct Turbo 11B",
     "id": "meta/llama-3.2-11b-vision-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.325,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json
similarity index 82%
rename from data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json
rename to data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json
index 17f50b1c8..615526ba3 100644
--- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json
+++ b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.2 Vision Instruct Turbo (90B)",
+    "name": "Llama 3.2 Vision Instruct Turbo 90B",
     "id": "meta/llama-3.2-90b-vision-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.819,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json
similarity index 82%
rename from data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json
rename to data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json
index 06851628a..8116db8ba 100644
--- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json
+++ b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.3 Instruct Turbo (70B)",
+    "name": "Llama 3.3 Instruct Turbo 70B",
     "id": "meta/llama-3.3-70b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.812,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json b/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json
similarity index 82%
rename from data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json
rename to data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json
index 624d96ab6..f3354af37 100644
--- a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json
+++ b/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/meta_llama-65b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/meta_llama-65b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "LLaMA (65B)",
+    "name": "LLaMA 65B",
     "id": "meta/llama-65b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.345,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json b/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json
similarity index 82%
rename from data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json
rename to data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json
index 42e0ca1f2..172c44cac 100644
--- a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json
+++ b/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/microsoft_phi-2/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/microsoft_phi-2/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.169,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json
similarity index 82%
rename from data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json
rename to data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json
index 40407df59..c613f7fec 100644
--- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json
+++ b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Phi-3 (14B)",
+    "name": "Phi-3 14B",
     "id": "microsoft/phi-3-medium-4k-instruct",
     "developer": "microsoft",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.509,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json
similarity index 82%
rename from data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json
rename to data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json
index 4a88d2532..f78b3f049 100644
--- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json
+++ b/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Phi-3 (7B)",
+    "name": "Phi-3 7B",
     "id": "microsoft/phi-3-small-8k-instruct",
     "developer": "microsoft",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.473,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json
similarity index 82%
rename from data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json
rename to data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json
index 81cb62772..97f8b3a1e 100644
--- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json
+++ b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Instruct v0.3 (7B)",
+    "name": "Mistral Instruct v0.3 7B",
     "id": "mistralai/mistral-7b-instruct-v0.3",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.196,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json b/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json
similarity index 82%
rename from data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json
rename to data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json
index 17ebd8348..30337d5a4 100644
--- a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json
+++ b/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral v0.1 (7B)",
+    "name": "Mistral v0.1 7B",
     "id": "mistralai/mistral-7b-v0.1",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.292,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json b/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json
similarity index 82%
rename from data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json
rename to data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json
index ca506f27c..edea4050d 100644
--- a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json
+++ b/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Large (2402)",
+    "name": "Mistral Large 2402",
     "id": "mistralai/mistral-large-2402",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.328,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json b/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json
similarity index 82%
rename from data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json
rename to data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json
index a10172374..d2dd06c67 100644
--- a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json
+++ b/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Large 2 (2407)",
+    "name": "Mistral Large 2 2407",
     "id": "mistralai/mistral-large-2407",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.744,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json b/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json
similarity index 82%
rename from data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json
rename to data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json
index 966d4c393..cbbf76044 100644
--- a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json
+++ b/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Medium (2312)",
+    "name": "Mistral Medium 2312",
     "id": "mistralai/mistral-medium-2312",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.268,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json b/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json
similarity index 82%
rename from data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json
rename to data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json
index 039a9d5cc..d1c6bf6d0 100644
--- a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json
+++ b/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Small (2402)",
+    "name": "Mistral Small 2402",
     "id": "mistralai/mistral-small-2402",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.288,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json b/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json
similarity index 82%
rename from data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json
rename to data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json
index 781bbb2c8..d020ec2ef 100644
--- a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json
+++ b/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mixtral (8x22B)",
+    "name": "Mixtral 8x22B",
     "id": "mistralai/mixtral-8x22b",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.705,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json
similarity index 82%
rename from data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json
rename to data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json
index 818a4bd2a..ca92e5358 100644
--- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json
+++ b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mixtral (8x7B 32K seqlen)",
+    "name": "Mixtral 8x7B 32K seqlen",
     "id": "mistralai/mixtral-8x7b-32kseqlen",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.51,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json
similarity index 82%
rename from data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json
rename to data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json
index dfc851db9..75b65c3cd 100644
--- a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json
+++ b/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral NeMo (2402)",
+    "name": "Mistral NeMo 2402",
     "id": "mistralai/open-mistral-nemo-2407",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.333,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json
similarity index 82%
rename from data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json
rename to data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json
index 28acf453d..c3db66d6f 100644
--- a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json
+++ b/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-3.5 Turbo (0613)",
+    "name": "GPT-3.5 Turbo 0613",
     "id": "openai/gpt-3.5-turbo-0613",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.358,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json b/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json
similarity index 82%
rename from data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json
rename to data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json
index 6fa2534b1..3b34bbe84 100644
--- a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json
+++ b/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-4-0613/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-4-0613/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 (0613)",
+    "name": "GPT-4 0613",
     "id": "openai/gpt-4-0613",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.867,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json b/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json
similarity index 82%
rename from data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json
rename to data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json
index c0d921b54..f80298de5 100644
--- a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json
+++ b/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 Turbo (1106 preview)",
+    "name": "GPT-4 Turbo 1106 preview",
     "id": "openai/gpt-4-1106-preview",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.698,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json
similarity index 82%
rename from data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json
rename to data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json
index 599344447..49bdd419a 100644
--- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json
+++ b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 Turbo (2024-04-09)",
+    "name": "GPT-4 Turbo 2024-04-09",
     "id": "openai/gpt-4-turbo-2024-04-09",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.864,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json b/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json
similarity index 82%
rename from data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json
rename to data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json
index 98feb8bc0..ab2f778b6 100644
--- a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json
+++ b/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o (2024-05-13)",
+    "name": "GPT-4o 2024-05-13",
     "id": "openai/gpt-4o-2024-05-13",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.938,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json b/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json
similarity index 82%
rename from data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json
rename to data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json
index cb595e51b..3d286d830 100644
--- a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json
+++ b/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o (2024-08-06)",
+    "name": "GPT-4o 2024-08-06",
     "id": "openai/gpt-4o-2024-08-06",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.928,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json
similarity index 82%
rename from data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json
rename to data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json
index 3fb056373..53ecaa7dc 100644
--- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json
+++ b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o mini (2024-07-18)",
+    "name": "GPT-4o mini 2024-07-18",
     "id": "openai/gpt-4o-mini-2024-07-18",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.701,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json b/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json
similarity index 82%
rename from data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json
rename to data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json
index d390f5b2a..c90d2c5a2 100644
--- a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json
+++ b/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_text-davinci-002/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_text-davinci-002/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-3.5 (text-davinci-002)",
+    "name": "GPT-3.5 text-davinci-002",
     "id": "openai/text-davinci-002",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.336,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json b/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json
similarity index 82%
rename from data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json
rename to data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json
index 99961f779..6f2c648e1 100644
--- a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json
+++ b/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/openai_text-davinci-003/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/openai_text-davinci-003/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-3.5 (text-davinci-003)",
+    "name": "GPT-3.5 text-davinci-003",
     "id": "openai/text-davinci-003",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.439,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json b/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json
similarity index 82%
rename from data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json
rename to data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json
index 6aed691a1..3b85e6b08 100644
--- a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json
+++ b/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 Chat (110B)",
+    "name": "Qwen1.5 Chat 110B",
     "id": "qwen/qwen1.5-110b-chat",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.55,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json b/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json
similarity index 82%
rename from data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json
rename to data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json
index f6c7858eb..4df79c00f 100644
--- a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json
+++ b/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (14B)",
+    "name": "Qwen1.5 14B",
     "id": "qwen/qwen1.5-14b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.425,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json b/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json
similarity index 82%
rename from data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json
rename to data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json
index 1314aa204..74a01181c 100644
--- a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json
+++ b/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (32B)",
+    "name": "Qwen1.5 32B",
     "id": "qwen/qwen1.5-32b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.546,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json b/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json
similarity index 82%
rename from data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json
rename to data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json
index 6da42bd5b..a056d0e42 100644
--- a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json
+++ b/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (72B)",
+    "name": "Qwen1.5 72B",
     "id": "qwen/qwen1.5-72b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.608,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json b/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json
similarity index 82%
rename from data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json
rename to data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json
index a4d0226b9..0757d65b1 100644
--- a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json
+++ b/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (7B)",
+    "name": "Qwen1.5 7B",
     "id": "qwen/qwen1.5-7b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.275,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json b/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json
similarity index 82%
rename from data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json
rename to data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json
index 4e8665e6b..2d8d0469d 100644
--- a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json
+++ b/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2 Instruct (72B)",
+    "name": "Qwen2 Instruct 72B",
     "id": "qwen/qwen2-72b-instruct",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.77,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json
similarity index 82%
rename from data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json
rename to data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json
index 9e7699d4b..6091d879d 100644
--- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json
+++ b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2.5 Instruct Turbo (72B)",
+    "name": "Qwen2.5 Instruct Turbo 72B",
     "id": "qwen/qwen2.5-72b-instruct-turbo",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.745,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json
similarity index 82%
rename from data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json
rename to data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json
index 126ae4e72..a9b9ae2a3 100644
--- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json
+++ b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2.5 Instruct Turbo (7B)",
+    "name": "Qwen2.5 Instruct Turbo 7B",
     "id": "qwen/qwen2.5-7b-instruct-turbo",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.488,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,18 +506,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -496,10 +571,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -545,13 +629,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json
similarity index 82%
rename from data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json
rename to data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json
index a52059819..f7f93c913 100644
--- a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json
+++ b/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.338,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json b/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json
similarity index 82%
rename from data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json
rename to data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json
index 518458e37..65a14de91 100644
--- a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json
+++ b/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/tiiuae_falcon-40b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Falcon (40B)",
+    "name": "Falcon 40B",
     "id": "tiiuae/falcon-40b",
     "developer": "tiiuae",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.217,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json b/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json
similarity index 82%
rename from data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json
rename to data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json
index 4a1515414..62d1fae1c 100644
--- a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json
+++ b/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/tiiuae_falcon-7b/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Falcon (7B)",
+    "name": "Falcon 7B",
     "id": "tiiuae/falcon-7b",
     "developer": "tiiuae",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.064,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json b/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json
similarity index 82%
rename from data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json
rename to data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json
index fd33bd463..9e56dbbb6 100644
--- a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json
+++ b/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/upstage_solar-pro-241126/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.602,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -390,11 +447,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -440,17 +506,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -495,10 +570,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -544,13 +628,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json b/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json
similarity index 82%
rename from data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json
rename to data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json
index 574c20cd8..2b000451d 100644
--- a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json
+++ b/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/writer_palmyra-x-004/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/writer_palmyra-x-004/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.808,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -93,11 +105,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -173,12 +194,21 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook",
-        "stop": "none"
+        "additional_details": {
+          "mode": "closedbook",
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -224,12 +254,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -275,18 +314,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -332,23 +380,32 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True",
-        "stop": "none"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True",
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -394,11 +451,20 @@
         }
       },
       "generation_config": {
-        "stop": "none"
+        "additional_details": {
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -444,18 +510,27 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ],
+          "stop": "none"
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -500,10 +575,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -549,14 +633,16 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ],
-        "stop": "none"
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ],
+          "stop": "none"
+        }
       }
     }
   ]
diff --git a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json b/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json
similarity index 82%
rename from data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json
rename to data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json
index 85f887f2f..fc600d1dc 100644
--- a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json
+++ b/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/writer_palmyra-x-v2/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Palmyra X V2 (33B)",
+    "name": "Palmyra X V2 33B",
     "id": "writer/palmyra-x-v2",
     "developer": "writer",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.589,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json b/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json
similarity index 82%
rename from data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json
rename to data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json
index ae69f6c5b..3ac2641c0 100644
--- a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json
+++ b/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_lite/writer_palmyra-x-v3/1767657482.092302",
-  "retrieved_timestamp": "1767657482.092302",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770829788.2883599",
+  "retrieved_timestamp": "1770829788.2883599",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Palmyra X V3 (72B)",
+    "name": "Palmyra X V3 72B",
     "id": "writer/palmyra-x-v3",
     "developer": "writer",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_lite",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -30,7 +34,6 @@
       "score_details": {
         "score": 0.679,
         "details": {
-          "description": null,
           "tab": "Accuracy",
           "Mean win rate - Efficiency": {
             "description": null,
@@ -44,10 +47,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NarrativeQA - F1",
+      "source_data": {
+        "dataset_name": "NarrativeQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -92,10 +104,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "source_data": {
+        "dataset_name": "NaturalQuestions (closed-book)",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
         "lower_is_better": false,
@@ -171,11 +192,20 @@
         }
       },
       "generation_config": {
-        "mode": "closedbook"
+        "additional_details": {
+          "mode": "closedbook"
+        }
       }
     },
     {
       "evaluation_name": "OpenbookQA - EM",
+      "source_data": {
+        "dataset_name": "OpenbookQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -221,12 +251,21 @@
         }
       },
       "generation_config": {
-        "dataset": "openbookqa",
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "dataset": "openbookqa",
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MMLU - EM",
+      "source_data": {
+        "dataset_name": "MMLU",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -272,18 +311,27 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "college_chemistry",
-          "computer_security",
-          "econometrics",
-          "us_foreign_policy"
-        ],
-        "method": "multiple_choice_joint"
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "college_chemistry",
+            "computer_security",
+            "econometrics",
+            "us_foreign_policy"
+          ],
+          "method": "multiple_choice_joint"
+        }
       }
     },
     {
       "evaluation_name": "MATH - Equivalent (CoT)",
+      "source_data": {
+        "dataset_name": "MATH",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
         "lower_is_better": false,
@@ -329,22 +377,31 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "algebra",
-          "counting_and_probability",
-          "geometry",
-          "intermediate_algebra",
-          "number_theory",
-          "prealgebra",
-          "precalculus"
-        ],
-        "level": "1",
-        "use_official_examples": "False",
-        "use_chain_of_thought": "True"
+        "additional_details": {
+          "subject": [
+            "algebra",
+            "counting_and_probability",
+            "geometry",
+            "intermediate_algebra",
+            "number_theory",
+            "prealgebra",
+            "precalculus"
+          ],
+          "level": "1",
+          "use_official_examples": "False",
+          "use_chain_of_thought": "True"
+        }
       }
     },
     {
       "evaluation_name": "GSM8K - EM",
+      "source_data": {
+        "dataset_name": "GSM8K",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
         "lower_is_better": false,
@@ -389,10 +446,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "LegalBench - EM",
+      "source_data": {
+        "dataset_name": "LegalBench",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -438,17 +504,26 @@
         }
       },
       "generation_config": {
-        "subset": [
-          "abercrombie",
-          "corporate_lobbying",
-          "function_of_decision_section",
-          "international_citizenship_questions",
-          "proa"
-        ]
+        "additional_details": {
+          "subset": [
+            "abercrombie",
+            "corporate_lobbying",
+            "function_of_decision_section",
+            "international_citizenship_questions",
+            "proa"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "MedQA - EM",
+      "source_data": {
+        "dataset_name": "MedQA",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
         "lower_is_better": false,
@@ -493,10 +568,19 @@
           }
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     },
     {
       "evaluation_name": "WMT 2014 - BLEU-4",
+      "source_data": {
+        "dataset_name": "WMT 2014",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
         "lower_is_better": false,
@@ -542,13 +626,15 @@
         }
       },
       "generation_config": {
-        "language_pair": [
-          "cs-en",
-          "de-en",
-          "fr-en",
-          "hi-en",
-          "ru-en"
-        ]
+        "additional_details": {
+          "language_pair": [
+            "cs-en",
+            "de-en",
+            "fr-en",
+            "hi-en",
+            "ru-en"
+          ]
+        }
       }
     }
   ]
diff --git a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json b/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json
similarity index 82%
rename from data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json
rename to data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json
index a5bdb42fc..cdb3ca461 100644
--- a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json
+++ b/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/01-ai_yi-34b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Yi (34B)",
+    "name": "Yi 34B",
     "id": "01-ai/yi-34b",
     "developer": "01-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.315,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json b/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json
similarity index 82%
rename from data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json
rename to data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json
index 5b2c50278..1b8b7e56f 100644
--- a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json
+++ b/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/01-ai_yi-6b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Yi (6B)",
+    "name": "Yi 6B",
     "id": "01-ai/yi-6b",
     "developer": "01-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.651,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json b/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json
similarity index 82%
rename from data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json
rename to data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json
index 938fbc9f2..29bc15bb3 100644
--- a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json
+++ b/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Yi Large (Preview)",
+    "name": "Yi Large Preview",
     "id": "01-ai/yi-large-preview",
     "developer": "01-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.258,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json b/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json
similarity index 82%
rename from data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json
rename to data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json
index b05362e32..2bf971f25 100644
--- a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json
+++ b/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.147,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json b/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json
similarity index 82%
rename from data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json
rename to data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json
index 374350118..7ffc27970 100644
--- a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json
+++ b/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.206,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json b/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json
similarity index 82%
rename from data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json
rename to data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json
index 2f32db71e..92ba45d60 100644
--- a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json
+++ b/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.887,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json b/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json
similarity index 82%
rename from data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json
rename to data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json
index 0ee329ec3..e53150712 100644
--- a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json
+++ b/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "OLMo 1.7 (7B)",
+    "name": "OLMo 1.7 7B",
     "id": "allenai/olmo-1.7-7b",
     "developer": "allenai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.196,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json b/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json
similarity index 82%
rename from data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json
rename to data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json
index dc71abcb3..301523f0f 100644
--- a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json
+++ b/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/allenai_olmo-7b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "OLMo (7B)",
+    "name": "OLMo 7B",
     "id": "allenai/olmo-7b",
     "developer": "allenai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.68,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json b/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json
similarity index 82%
rename from data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json
rename to data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json
index 036d68cdd..d80215b78 100644
--- a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json
+++ b/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.987,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json b/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json
similarity index 82%
rename from data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json
rename to data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json
index dc2e53d31..f28fc4ccf 100644
--- a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json
+++ b/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 1.0,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json b/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json
similarity index 82%
rename from data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json
rename to data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json
index 74dd04dc4..66455ef1d 100644
--- a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json
+++ b/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.975,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json b/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json
rename to data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json
index 94c86600d..163a9d31a 100644
--- a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json
+++ b/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.048,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json
rename to data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json
index 15ba960b1..edabc3b81 100644
--- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json
+++ b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Haiku (20241022)",
+    "name": "Claude 3.5 Haiku 20241022",
     "id": "anthropic/claude-3-5-haiku-20241022",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.128,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json
rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json
index 43e320af9..8d402d4fb 100644
--- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json
+++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Sonnet (20240620)",
+    "name": "Claude 3.5 Sonnet 20240620",
     "id": "anthropic/claude-3-5-sonnet-20240620",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.17,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json
rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json
index 7df36bb32..a435d5c4d 100644
--- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json
+++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3.5 Sonnet (20241022)",
+    "name": "Claude 3.5 Sonnet 20241022",
     "id": "anthropic/claude-3-5-sonnet-20241022",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.311,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json
rename to data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json
index 9885a79d4..66b68fa6b 100644
--- a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json
+++ b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3 Haiku (20240307)",
+    "name": "Claude 3 Haiku 20240307",
     "id": "anthropic/claude-3-haiku-20240307",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.28,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json
rename to data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json
index ab57a1503..140c0db28 100644
--- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json
+++ b/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3 Opus (20240229)",
+    "name": "Claude 3 Opus 20240229",
     "id": "anthropic/claude-3-opus-20240229",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.014,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json
rename to data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json
index 710c70a2e..a6eb131df 100644
--- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json
+++ b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Claude 3 Sonnet (20240229)",
+    "name": "Claude 3 Sonnet 20240229",
     "id": "anthropic/claude-3-sonnet-20240229",
     "developer": "anthropic",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.082,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json b/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json
similarity index 82%
rename from data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json
rename to data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json
index b632a0864..38a7ffacb 100644
--- a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json
+++ b/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.186,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json b/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json
similarity index 82%
rename from data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json
rename to data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json
index 6ef0cc597..4b85be9b2 100644
--- a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json
+++ b/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/cohere_command-r-plus/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.825,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json b/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json
similarity index 82%
rename from data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json
rename to data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json
index 6fa172bf8..90cbd571c 100644
--- a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json
+++ b/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/cohere_command-r/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/cohere_command-r/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.959,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json b/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json
similarity index 82%
rename from data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json
rename to data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json
index 8d0b57f82..753506525 100644
--- a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json
+++ b/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.537,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json
similarity index 82%
rename from data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json
rename to data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json
index 7837e5696..f12e3799a 100644
--- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json
+++ b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "DeepSeek LLM Chat (67B)",
+    "name": "DeepSeek LLM Chat 67B",
     "id": "deepseek-ai/deepseek-llm-67b-chat",
     "developer": "deepseek-ai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.387,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json
similarity index 82%
rename from data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json
rename to data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json
index b9d5d50e7..86fd9dec9 100644
--- a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json
+++ b/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.215,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json b/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json
rename to data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json
index 4fb164090..0184241c6 100644
--- a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json
+++ b/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.0 Pro (001)",
+    "name": "Gemini 1.0 Pro 001",
     "id": "google/gemini-1.0-pro-001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.677,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json b/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json
rename to data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json
index a91e47447..7baa6457e 100644
--- a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json
+++ b/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Flash (001)",
+    "name": "Gemini 1.5 Flash 001",
     "id": "google/gemini-1.5-flash-001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.47,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json b/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json
rename to data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json
index c8a9b1912..f095d6361 100644
--- a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json
+++ b/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Flash (002)",
+    "name": "Gemini 1.5 Flash 002",
     "id": "google/gemini-1.5-flash-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.817,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json
rename to data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json
index ffdf7910d..fe99bd4e4 100644
--- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json
+++ b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Flash (0514 preview)",
+    "name": "Gemini 1.5 Flash 0514 preview",
     "id": "google/gemini-1.5-flash-preview-0514",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.713,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json b/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json
rename to data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json
index 0115a3fa0..4b9fc2846 100644
--- a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json
+++ b/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Pro (001)",
+    "name": "Gemini 1.5 Pro 001",
     "id": "google/gemini-1.5-pro-001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.349,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json b/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json
rename to data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json
index 1c57dbb48..47f80252d 100644
--- a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json
+++ b/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Pro (002)",
+    "name": "Gemini 1.5 Pro 002",
     "id": "google/gemini-1.5-pro-002",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.334,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json
rename to data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json
index 065435cc3..901c1dd01 100644
--- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json
+++ b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 1.5 Pro (0409 preview)",
+    "name": "Gemini 1.5 Pro 0409 preview",
     "id": "google/gemini-1.5-pro-preview-0409",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.118,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json
similarity index 82%
rename from data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json
rename to data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json
index 9b4101c21..0eda6b6b1 100644
--- a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json
+++ b/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemini 2.0 Flash (Experimental)",
+    "name": "Gemini 2.0 Flash Experimental",
     "id": "google/gemini-2.0-flash-exp",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.567,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json b/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json
similarity index 82%
rename from data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json
rename to data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json
index 2a0eccbe5..142296fc4 100644
--- a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json
+++ b/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemma-2-27b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemma 2 (27B)",
+    "name": "Gemma 2 27B",
     "id": "google/gemma-2-27b",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.05,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json b/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json
similarity index 82%
rename from data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json
rename to data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json
index 7b83a32f9..6f84fd47f 100644
--- a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json
+++ b/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemma-2-9b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemma 2 (9B)",
+    "name": "Gemma 2 9B",
     "id": "google/gemma-2-9b",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.265,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json b/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json
similarity index 82%
rename from data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json
rename to data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json
index 1480d9d56..ac525859f 100644
--- a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json
+++ b/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_gemma-7b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_gemma-7b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Gemma (7B)",
+    "name": "Gemma 7B",
     "id": "google/gemma-7b",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.824,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json b/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json
similarity index 82%
rename from data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json
rename to data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json
index a20b853b7..b20dbe54d 100644
--- a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json
+++ b/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_text-bison@001/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_text-bison@001/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "PaLM-2 (Bison)",
+    "name": "PaLM-2 Bison",
     "id": "google/text-bison@001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.192,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json b/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json
similarity index 82%
rename from data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json
rename to data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json
index 061cfda40..7b3536f41 100644
--- a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json
+++ b/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/google_text-unicorn@001/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "PaLM-2 (Unicorn)",
+    "name": "PaLM-2 Unicorn",
     "id": "google/text-unicorn@001",
     "developer": "google",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.142,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json b/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json
rename to data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json
index 999bc7bce..a786ac0dd 100644
--- a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json
+++ b/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-2-13b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 2 (13B)",
+    "name": "Llama 2 13B",
     "id": "meta/llama-2-13b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.502,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json b/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json
rename to data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json
index 2bd647ad6..bd988b6d8 100644
--- a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json
+++ b/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-2-70b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 2 (70B)",
+    "name": "Llama 2 70B",
     "id": "meta/llama-2-70b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.508,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json b/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json
rename to data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json
index f7641555c..b29cd7460 100644
--- a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json
+++ b/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-2-7b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 2 (7B)",
+    "name": "Llama 2 7B",
     "id": "meta/llama-2-7b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.681,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json b/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json
rename to data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json
index 028924f0a..d46d7f50a 100644
--- a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json
+++ b/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3-70b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3 (70B)",
+    "name": "Llama 3 70B",
     "id": "meta/llama-3-70b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.524,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json b/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json
rename to data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json
index 493305a26..31dfddc02 100644
--- a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json
+++ b/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3-8b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3 (8B)",
+    "name": "Llama 3 8B",
     "id": "meta/llama-3-8b",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.733,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json
rename to data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json
index 5e68e1b5a..64eb43090 100644
--- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json
+++ b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (405B)",
+    "name": "Llama 3.1 Instruct Turbo 405B",
     "id": "meta/llama-3.1-405b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.33,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json
rename to data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json
index 7f880e52b..149eb0100 100644
--- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json
+++ b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (70B)",
+    "name": "Llama 3.1 Instruct Turbo 70B",
     "id": "meta/llama-3.1-70b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.021,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json
rename to data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json
index bdc0510b6..46bd04117 100644
--- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json
+++ b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.1 Instruct Turbo (8B)",
+    "name": "Llama 3.1 Instruct Turbo 8B",
     "id": "meta/llama-3.1-8b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.475,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json
rename to data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json
index e9ec2f904..187d1c6a7 100644
--- a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json
+++ b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.2 Vision Instruct Turbo (11B)",
+    "name": "Llama 3.2 Vision Instruct Turbo 11B",
     "id": "meta/llama-3.2-11b-vision-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.897,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json
rename to data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json
index 51cb25f1e..9625c1e16 100644
--- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json
+++ b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.2 Vision Instruct Turbo (90B)",
+    "name": "Llama 3.2 Vision Instruct Turbo 90B",
     "id": "meta/llama-3.2-90b-vision-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.773,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json
similarity index 82%
rename from data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json
rename to data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json
index 124028675..8effae129 100644
--- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json
+++ b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Llama 3.3 Instruct Turbo (70B)",
+    "name": "Llama 3.3 Instruct Turbo 70B",
     "id": "meta/llama-3.3-70b-instruct-turbo",
     "developer": "meta",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.722,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json b/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json
similarity index 82%
rename from data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json
rename to data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json
index f3162d0fe..07027b7fb 100644
--- a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json
+++ b/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/microsoft_phi-2/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/microsoft_phi-2/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.824,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json
similarity index 82%
rename from data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json
rename to data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json
index 97f9c3c96..6ce22179c 100644
--- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json
+++ b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Phi-3 (14B)",
+    "name": "Phi-3 14B",
     "id": "microsoft/phi-3-medium-4k-instruct",
     "developer": "microsoft",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.015,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json
similarity index 82%
rename from data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json
rename to data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json
index 9da3cad91..7278b002a 100644
--- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json
+++ b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Phi-3 (7B)",
+    "name": "Phi-3 7B",
     "id": "microsoft/phi-3-small-8k-instruct",
     "developer": "microsoft",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.708,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json
rename to data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json
index 2592b75a7..886ff1732 100644
--- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json
+++ b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Instruct v0.3 (7B)",
+    "name": "Mistral Instruct v0.3 7B",
     "id": "mistralai/mistral-7b-instruct-v0.3",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.509,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json
rename to data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json
index 77ee3f1a1..935804d7f 100644
--- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json
+++ b/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral v0.1 (7B)",
+    "name": "Mistral v0.1 7B",
     "id": "mistralai/mistral-7b-v0.1",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.213,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json b/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json
rename to data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json
index c34e3e47f..bc72ce600 100644
--- a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json
+++ b/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Large (2402)",
+    "name": "Mistral Large 2402",
     "id": "mistralai/mistral-large-2402",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.464,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json b/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json
rename to data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json
index 4e005a631..272dc142d 100644
--- a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json
+++ b/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Large 2 (2407)",
+    "name": "Mistral Large 2 2407",
     "id": "mistralai/mistral-large-2407",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.24,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json b/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json
rename to data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json
index ddc506063..607dc1e03 100644
--- a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json
+++ b/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral Small (2402)",
+    "name": "Mistral Small 2402",
     "id": "mistralai/mistral-small-2402",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.54,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json b/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json
rename to data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json
index 35cc50f7b..401d4b7c8 100644
--- a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json
+++ b/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mixtral (8x22B)",
+    "name": "Mixtral 8x22B",
     "id": "mistralai/mixtral-8x22b",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.598,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json
similarity index 82%
rename from data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json
rename to data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json
index 247f8572e..b88295eb7 100644
--- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json
+++ b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mixtral (8x7B 32K seqlen)",
+    "name": "Mixtral 8x7B 32K seqlen",
     "id": "mistralai/mixtral-8x7b-32kseqlen",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.689,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json
similarity index 82%
rename from data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json
rename to data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json
index 20e5d8bc5..5a436d9c9 100644
--- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json
+++ b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Mistral NeMo (2402)",
+    "name": "Mistral NeMo 2402",
     "id": "mistralai/open-mistral-nemo-2407",
     "developer": "mistralai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.215,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json
rename to data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json
index 61bdc2a92..5923a61b0 100644
--- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json
+++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-3.5 Turbo (0125)",
+    "name": "GPT-3.5 Turbo 0125",
     "id": "openai/gpt-3.5-turbo-0125",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.493,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json
rename to data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json
index a7037b692..c62c20e9c 100644
--- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json
+++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-3.5 Turbo (0613)",
+    "name": "GPT-3.5 Turbo 0613",
     "id": "openai/gpt-3.5-turbo-0613",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.589,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json b/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json
rename to data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json
index 8a368f8b6..9877671a2 100644
--- a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json
+++ b/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 (0613)",
+    "name": "GPT-4 0613",
     "id": "openai/gpt-4-0613",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.517,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json b/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json
rename to data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json
index 41438331c..448f5bbca 100644
--- a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json
+++ b/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 Turbo (1106 preview)",
+    "name": "GPT-4 Turbo 1106 preview",
     "id": "openai/gpt-4-1106-preview",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.416,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json
rename to data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json
index a7796e764..aefe21734 100644
--- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json
+++ b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4 Turbo (2024-04-09)",
+    "name": "GPT-4 Turbo 2024-04-09",
     "id": "openai/gpt-4-turbo-2024-04-09",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.351,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json
rename to data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json
index 1572c27c7..efc7bbe5a 100644
--- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json
+++ b/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o (2024-05-13)",
+    "name": "GPT-4o 2024-05-13",
     "id": "openai/gpt-4o-2024-05-13",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.671,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json
rename to data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json
index 4ba84b207..fe9568710 100644
--- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json
+++ b/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o (2024-08-06)",
+    "name": "GPT-4o 2024-08-06",
     "id": "openai/gpt-4o-2024-08-06",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.52,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json
similarity index 82%
rename from data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json
rename to data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json
index f69b1b3d4..681eae3b7 100644
--- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json
+++ b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "GPT-4o mini (2024-07-18)",
+    "name": "GPT-4o mini 2024-07-18",
     "id": "openai/gpt-4o-mini-2024-07-18",
     "developer": "openai",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.774,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json
rename to data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json
index 190b1dce2..6667a05bb 100644
--- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json
+++ b/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 Chat (110B)",
+    "name": "Qwen1.5 Chat 110B",
     "id": "qwen/qwen1.5-110b-chat",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.875,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json b/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json
rename to data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json
index 7ff151a72..ce5d472c6 100644
--- a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json
+++ b/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (14B)",
+    "name": "Qwen1.5 14B",
     "id": "qwen/qwen1.5-14b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.796,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json b/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json
rename to data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json
index 421333da5..ff8059b60 100644
--- a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json
+++ b/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (32B)",
+    "name": "Qwen1.5 32B",
     "id": "qwen/qwen1.5-32b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.624,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json b/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json
rename to data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json
index d14327eec..c69a6d09c 100644
--- a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json
+++ b/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (72B)",
+    "name": "Qwen1.5 72B",
     "id": "qwen/qwen1.5-72b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.65,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json b/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json
rename to data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json
index d9688a597..8651674c9 100644
--- a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json
+++ b/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen1.5 (7B)",
+    "name": "Qwen1.5 7B",
     "id": "qwen/qwen1.5-7b",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.843,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json
rename to data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json
index abb62e63a..89026d1dc 100644
--- a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json
+++ b/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2 Instruct (72B)",
+    "name": "Qwen2 Instruct 72B",
     "id": "qwen/qwen2-72b-instruct",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.826,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json
rename to data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json
index ee06a7f3d..1a03b982a 100644
--- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json
+++ b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2.5 Instruct Turbo (72B)",
+    "name": "Qwen2.5 Instruct Turbo 72B",
     "id": "qwen/qwen2.5-72b-instruct-turbo",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.548,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json
similarity index 82%
rename from data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json
rename to data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json
index f8033410f..032da16a1 100644
--- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json
+++ b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Qwen2.5 Instruct Turbo (7B)",
+    "name": "Qwen2.5 Instruct Turbo 7B",
     "id": "qwen/qwen2.5-7b-instruct-turbo",
     "developer": "qwen",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.887,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json
similarity index 82%
rename from data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json
rename to data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json
index cde071792..5482f32f0 100644
--- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json
+++ b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.565,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json b/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json
similarity index 82%
rename from data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json
rename to data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json
index 7d7fe6a40..b71ad83e6 100644
--- a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json
+++ b/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.462,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json b/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json
similarity index 82%
rename from data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json
rename to data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json
index c2c0d493b..734ce34f3 100644
--- a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json
+++ b/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.629,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json b/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json
similarity index 82%
rename from data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json
rename to data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json
index fd6405aa5..0e5669e0a 100644
--- a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json
+++ b/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json
@@ -1,10 +1,7 @@
 {
-  "schema_version": "0.1.0",
-  "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1767657487.397731",
-  "retrieved_timestamp": "1767657487.397731",
-  "source_data": [
-    "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-  ],
+  "schema_version": "0.2.0",
+  "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770830564.5477738",
+  "retrieved_timestamp": "1770830564.5477738",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -12,7 +9,7 @@
     "evaluator_relationship": "third_party"
   },
   "model_info": {
-    "name": "Palmyra X V3 (72B)",
+    "name": "Palmyra X V3 72B",
     "id": "writer/palmyra-x-v3",
     "developer": "writer",
     "inference_platform": "unknown"
@@ -20,6 +17,13 @@
   "evaluation_results": [
     {
       "evaluation_name": "MMLU All Subjects - EM",
+      "source_data": {
+        "dataset_name": "MMLU All Subjects",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -65,130 +69,139 @@
         }
       },
       "generation_config": {
-        "subject": [
-          "abstract_algebra",
-          "anatomy",
-          "astronomy",
-          "business_ethics",
-          "clinical_knowledge",
-          "college_biology",
-          "college_chemistry",
-          "college_computer_science",
-          "college_mathematics",
-          "college_medicine",
-          "college_physics",
-          "computer_security",
-          "conceptual_physics",
-          "econometrics",
-          "electrical_engineering",
-          "elementary_mathematics",
-          "formal_logic",
-          "global_facts",
-          "high_school_biology",
-          "high_school_chemistry",
-          "high_school_computer_science",
-          "high_school_european_history",
-          "high_school_geography",
-          "high_school_government_and_politics",
-          "high_school_macroeconomics",
-          "high_school_mathematics",
-          "high_school_microeconomics",
-          "high_school_physics",
-          "high_school_psychology",
-          "high_school_statistics",
-          "high_school_us_history",
-          "high_school_world_history",
-          "human_aging",
-          "human_sexuality",
-          "international_law",
-          "jurisprudence",
-          "logical_fallacies",
-          "machine_learning",
-          "management",
-          "marketing",
-          "medical_genetics",
-          "miscellaneous",
-          "moral_disputes",
-          "moral_scenarios",
-          "nutrition",
-          "philosophy",
-          "prehistory",
-          "professional_accounting",
-          "professional_law",
-          "professional_medicine",
-          "professional_psychology",
-          "public_relations",
-          "security_studies",
-          "sociology",
-          "us_foreign_policy",
-          "virology",
-          "world_religions"
-        ],
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": [
-          "mmlu_abstract_algebra",
-          "mmlu_anatomy",
-          "mmlu_astronomy",
-          "mmlu_business_ethics",
-          "mmlu_clinical_knowledge",
-          "mmlu_college_biology",
-          "mmlu_college_chemistry",
-          "mmlu_college_computer_science",
-          "mmlu_college_mathematics",
-          "mmlu_college_medicine",
-          "mmlu_college_physics",
-          "mmlu_computer_security",
-          "mmlu_conceptual_physics",
-          "mmlu_econometrics",
-          "mmlu_electrical_engineering",
-          "mmlu_elementary_mathematics",
-          "mmlu_formal_logic",
-          "mmlu_global_facts",
-          "mmlu_high_school_biology",
-          "mmlu_high_school_chemistry",
-          "mmlu_high_school_computer_science",
-          "mmlu_high_school_european_history",
-          "mmlu_high_school_geography",
-          "mmlu_high_school_government_and_politics",
-          "mmlu_high_school_macroeconomics",
-          "mmlu_high_school_mathematics",
-          "mmlu_high_school_microeconomics",
-          "mmlu_high_school_physics",
-          "mmlu_high_school_psychology",
-          "mmlu_high_school_statistics",
-          "mmlu_high_school_us_history",
-          "mmlu_high_school_world_history",
-          "mmlu_human_aging",
-          "mmlu_human_sexuality",
-          "mmlu_international_law",
-          "mmlu_jurisprudence",
-          "mmlu_logical_fallacies",
-          "mmlu_machine_learning",
-          "mmlu_management",
-          "mmlu_marketing",
-          "mmlu_medical_genetics",
-          "mmlu_miscellaneous",
-          "mmlu_moral_disputes",
-          "mmlu_moral_scenarios",
-          "mmlu_nutrition",
-          "mmlu_philosophy",
-          "mmlu_prehistory",
-          "mmlu_professional_accounting",
-          "mmlu_professional_law",
-          "mmlu_professional_medicine",
-          "mmlu_professional_psychology",
-          "mmlu_public_relations",
-          "mmlu_security_studies",
-          "mmlu_sociology",
-          "mmlu_us_foreign_policy",
-          "mmlu_virology",
-          "mmlu_world_religions"
-        ]
+        "additional_details": {
+          "subject": [
+            "abstract_algebra",
+            "anatomy",
+            "astronomy",
+            "business_ethics",
+            "clinical_knowledge",
+            "college_biology",
+            "college_chemistry",
+            "college_computer_science",
+            "college_mathematics",
+            "college_medicine",
+            "college_physics",
+            "computer_security",
+            "conceptual_physics",
+            "econometrics",
+            "electrical_engineering",
+            "elementary_mathematics",
+            "formal_logic",
+            "global_facts",
+            "high_school_biology",
+            "high_school_chemistry",
+            "high_school_computer_science",
+            "high_school_european_history",
+            "high_school_geography",
+            "high_school_government_and_politics",
+            "high_school_macroeconomics",
+            "high_school_mathematics",
+            "high_school_microeconomics",
+            "high_school_physics",
+            "high_school_psychology",
+            "high_school_statistics",
+            "high_school_us_history",
+            "high_school_world_history",
+            "human_aging",
+            "human_sexuality",
+            "international_law",
+            "jurisprudence",
+            "logical_fallacies",
+            "machine_learning",
+            "management",
+            "marketing",
+            "medical_genetics",
+            "miscellaneous",
+            "moral_disputes",
+            "moral_scenarios",
+            "nutrition",
+            "philosophy",
+            "prehistory",
+            "professional_accounting",
+            "professional_law",
+            "professional_medicine",
+            "professional_psychology",
+            "public_relations",
+            "security_studies",
+            "sociology",
+            "us_foreign_policy",
+            "virology",
+            "world_religions"
+          ],
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": [
+            "mmlu_abstract_algebra",
+            "mmlu_anatomy",
+            "mmlu_astronomy",
+            "mmlu_business_ethics",
+            "mmlu_clinical_knowledge",
+            "mmlu_college_biology",
+            "mmlu_college_chemistry",
+            "mmlu_college_computer_science",
+            "mmlu_college_mathematics",
+            "mmlu_college_medicine",
+            "mmlu_college_physics",
+            "mmlu_computer_security",
+            "mmlu_conceptual_physics",
+            "mmlu_econometrics",
+            "mmlu_electrical_engineering",
+            "mmlu_elementary_mathematics",
+            "mmlu_formal_logic",
+            "mmlu_global_facts",
+            "mmlu_high_school_biology",
+            "mmlu_high_school_chemistry",
+            "mmlu_high_school_computer_science",
+            "mmlu_high_school_european_history",
+            "mmlu_high_school_geography",
+            "mmlu_high_school_government_and_politics",
+            "mmlu_high_school_macroeconomics",
+            "mmlu_high_school_mathematics",
+            "mmlu_high_school_microeconomics",
+            "mmlu_high_school_physics",
+            "mmlu_high_school_psychology",
+            "mmlu_high_school_statistics",
+            "mmlu_high_school_us_history",
+            "mmlu_high_school_world_history",
+            "mmlu_human_aging",
+            "mmlu_human_sexuality",
+            "mmlu_international_law",
+            "mmlu_jurisprudence",
+            "mmlu_logical_fallacies",
+            "mmlu_machine_learning",
+            "mmlu_management",
+            "mmlu_marketing",
+            "mmlu_medical_genetics",
+            "mmlu_miscellaneous",
+            "mmlu_moral_disputes",
+            "mmlu_moral_scenarios",
+            "mmlu_nutrition",
+            "mmlu_philosophy",
+            "mmlu_prehistory",
+            "mmlu_professional_accounting",
+            "mmlu_professional_law",
+            "mmlu_professional_medicine",
+            "mmlu_professional_psychology",
+            "mmlu_public_relations",
+            "mmlu_security_studies",
+            "mmlu_sociology",
+            "mmlu_us_foreign_policy",
+            "mmlu_virology",
+            "mmlu_world_religions"
+          ]
+        }
       }
     },
     {
       "evaluation_name": "Abstract Algebra - EM",
+      "source_data": {
+        "dataset_name": "Abstract Algebra",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -234,14 +247,23 @@
         }
       },
       "generation_config": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
+        "additional_details": {
+          "subject": "abstract_algebra",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_abstract_algebra"
+        }
       }
     },
     {
       "evaluation_name": "Anatomy - EM",
+      "source_data": {
+        "dataset_name": "Anatomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -287,14 +309,23 @@
         }
       },
       "generation_config": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
+        "additional_details": {
+          "subject": "anatomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_anatomy"
+        }
       }
     },
     {
       "evaluation_name": "College Physics - EM",
+      "source_data": {
+        "dataset_name": "College Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -490,14 +521,23 @@
         }
       },
       "generation_config": {
-        "subject": "college_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_college_physics"
+        "additional_details": {
+          "subject": "college_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_college_physics"
+        }
       }
     },
     {
       "evaluation_name": "Computer Security - EM",
+      "source_data": {
+        "dataset_name": "Computer Security",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -543,14 +583,23 @@
         }
       },
       "generation_config": {
-        "subject": "computer_security",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_computer_security"
+        "additional_details": {
+          "subject": "computer_security",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_computer_security"
+        }
       }
     },
     {
       "evaluation_name": "Econometrics - EM",
+      "source_data": {
+        "dataset_name": "Econometrics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -596,14 +645,23 @@
         }
       },
       "generation_config": {
-        "subject": "econometrics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_econometrics"
+        "additional_details": {
+          "subject": "econometrics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_econometrics"
+        }
       }
     },
     {
       "evaluation_name": "Global Facts - EM",
+      "source_data": {
+        "dataset_name": "Global Facts",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -649,14 +707,23 @@
         }
       },
       "generation_config": {
-        "subject": "global_facts",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_global_facts"
+        "additional_details": {
+          "subject": "global_facts",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_global_facts"
+        }
       }
     },
     {
       "evaluation_name": "Jurisprudence - EM",
+      "source_data": {
+        "dataset_name": "Jurisprudence",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -702,14 +769,23 @@
         }
       },
       "generation_config": {
-        "subject": "jurisprudence",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_jurisprudence"
+        "additional_details": {
+          "subject": "jurisprudence",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_jurisprudence"
+        }
       }
     },
     {
       "evaluation_name": "Philosophy - EM",
+      "source_data": {
+        "dataset_name": "Philosophy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -755,14 +831,23 @@
         }
       },
       "generation_config": {
-        "subject": "philosophy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_philosophy"
+        "additional_details": {
+          "subject": "philosophy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_philosophy"
+        }
       }
     },
     {
       "evaluation_name": "Professional Psychology - EM",
+      "source_data": {
+        "dataset_name": "Professional Psychology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -898,14 +983,23 @@
         }
       },
       "generation_config": {
-        "subject": "professional_psychology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_professional_psychology"
+        "additional_details": {
+          "subject": "professional_psychology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_professional_psychology"
+        }
       }
     },
     {
       "evaluation_name": "Us Foreign Policy - EM",
+      "source_data": {
+        "dataset_name": "Us Foreign Policy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -951,14 +1045,23 @@
         }
       },
       "generation_config": {
-        "subject": "us_foreign_policy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_us_foreign_policy"
+        "additional_details": {
+          "subject": "us_foreign_policy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_us_foreign_policy"
+        }
       }
     },
     {
       "evaluation_name": "Astronomy - EM",
+      "source_data": {
+        "dataset_name": "Astronomy",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1004,14 +1107,23 @@
         }
       },
       "generation_config": {
-        "subject": "astronomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_astronomy"
+        "additional_details": {
+          "subject": "astronomy",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_astronomy"
+        }
       }
     },
     {
       "evaluation_name": "Business Ethics - EM",
+      "source_data": {
+        "dataset_name": "Business Ethics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1057,14 +1169,23 @@
         }
       },
       "generation_config": {
-        "subject": "business_ethics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_business_ethics"
+        "additional_details": {
+          "subject": "business_ethics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_business_ethics"
+        }
       }
     },
     {
       "evaluation_name": "Clinical Knowledge - EM",
+      "source_data": {
+        "dataset_name": "Clinical Knowledge",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1110,14 +1231,23 @@
         }
       },
       "generation_config": {
-        "subject": "clinical_knowledge",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_clinical_knowledge"
+        "additional_details": {
+          "subject": "clinical_knowledge",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_clinical_knowledge"
+        }
       }
     },
     {
       "evaluation_name": "Conceptual Physics - EM",
+      "source_data": {
+        "dataset_name": "Conceptual Physics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1163,14 +1293,23 @@
         }
       },
       "generation_config": {
-        "subject": "conceptual_physics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_conceptual_physics"
+        "additional_details": {
+          "subject": "conceptual_physics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_conceptual_physics"
+        }
       }
     },
     {
       "evaluation_name": "Electrical Engineering - EM",
+      "source_data": {
+        "dataset_name": "Electrical Engineering",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1216,14 +1355,23 @@
         }
       },
       "generation_config": {
-        "subject": "electrical_engineering",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_electrical_engineering"
+        "additional_details": {
+          "subject": "electrical_engineering",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_electrical_engineering"
+        }
       }
     },
     {
       "evaluation_name": "Elementary Mathematics - EM",
+      "source_data": {
+        "dataset_name": "Elementary Mathematics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1269,14 +1417,23 @@
         }
       },
       "generation_config": {
-        "subject": "elementary_mathematics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_elementary_mathematics"
+        "additional_details": {
+          "subject": "elementary_mathematics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_elementary_mathematics"
+        }
       }
     },
     {
       "evaluation_name": "Formal Logic - EM",
+      "source_data": {
+        "dataset_name": "Formal Logic",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1322,14 +1479,23 @@
         }
       },
       "generation_config": {
-        "subject": "formal_logic",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_formal_logic"
+        "additional_details": {
+          "subject": "formal_logic",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_formal_logic"
+        }
       }
     },
     {
       "evaluation_name": "High School World History - EM",
+      "source_data": {
+        "dataset_name": "High School World History",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1765,14 +1931,23 @@
         }
       },
       "generation_config": {
-        "subject": "high_school_world_history",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_high_school_world_history"
+        "additional_details": {
+          "subject": "high_school_world_history",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_high_school_world_history"
+        }
       }
     },
     {
       "evaluation_name": "Human Sexuality - EM",
+      "source_data": {
+        "dataset_name": "Human Sexuality",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1848,14 +2023,23 @@
         }
       },
       "generation_config": {
-        "subject": "human_sexuality",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_human_sexuality"
+        "additional_details": {
+          "subject": "human_sexuality",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_human_sexuality"
+        }
       }
     },
     {
       "evaluation_name": "International Law - EM",
+      "source_data": {
+        "dataset_name": "International Law",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1901,14 +2085,23 @@
         }
       },
       "generation_config": {
-        "subject": "international_law",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_international_law"
+        "additional_details": {
+          "subject": "international_law",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_international_law"
+        }
       }
     },
     {
       "evaluation_name": "Logical Fallacies - EM",
+      "source_data": {
+        "dataset_name": "Logical Fallacies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -1954,14 +2147,23 @@
         }
       },
       "generation_config": {
-        "subject": "logical_fallacies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_logical_fallacies"
+        "additional_details": {
+          "subject": "logical_fallacies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_logical_fallacies"
+        }
       }
     },
     {
       "evaluation_name": "Machine Learning - EM",
+      "source_data": {
+        "dataset_name": "Machine Learning",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2007,14 +2209,23 @@
         }
       },
       "generation_config": {
-        "subject": "machine_learning",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_machine_learning"
+        "additional_details": {
+          "subject": "machine_learning",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_machine_learning"
+        }
       }
     },
     {
       "evaluation_name": "Management - EM",
+      "source_data": {
+        "dataset_name": "Management",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2060,14 +2271,23 @@
         }
       },
       "generation_config": {
-        "subject": "management",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_management"
+        "additional_details": {
+          "subject": "management",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_management"
+        }
       }
     },
     {
       "evaluation_name": "Marketing - EM",
+      "source_data": {
+        "dataset_name": "Marketing",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2113,14 +2333,23 @@
         }
       },
       "generation_config": {
-        "subject": "marketing",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_marketing"
+        "additional_details": {
+          "subject": "marketing",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_marketing"
+        }
       }
     },
     {
       "evaluation_name": "Medical Genetics - EM",
+      "source_data": {
+        "dataset_name": "Medical Genetics",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2166,14 +2395,23 @@
         }
       },
       "generation_config": {
-        "subject": "medical_genetics",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_medical_genetics"
+        "additional_details": {
+          "subject": "medical_genetics",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_medical_genetics"
+        }
       }
     },
     {
       "evaluation_name": "Miscellaneous - EM",
+      "source_data": {
+        "dataset_name": "Miscellaneous",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2219,14 +2457,23 @@
         }
       },
       "generation_config": {
-        "subject": "miscellaneous",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_miscellaneous"
+        "additional_details": {
+          "subject": "miscellaneous",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_miscellaneous"
+        }
       }
     },
     {
       "evaluation_name": "Moral Scenarios - EM",
+      "source_data": {
+        "dataset_name": "Moral Scenarios",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2302,14 +2549,23 @@
         }
       },
       "generation_config": {
-        "subject": "moral_scenarios",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_moral_scenarios"
+        "additional_details": {
+          "subject": "moral_scenarios",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_moral_scenarios"
+        }
       }
     },
     {
       "evaluation_name": "Nutrition - EM",
+      "source_data": {
+        "dataset_name": "Nutrition",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2355,14 +2611,23 @@
         }
       },
       "generation_config": {
-        "subject": "nutrition",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_nutrition"
+        "additional_details": {
+          "subject": "nutrition",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_nutrition"
+        }
       }
     },
     {
       "evaluation_name": "Prehistory - EM",
+      "source_data": {
+        "dataset_name": "Prehistory",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2408,14 +2673,23 @@
         }
       },
       "generation_config": {
-        "subject": "prehistory",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_prehistory"
+        "additional_details": {
+          "subject": "prehistory",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_prehistory"
+        }
       }
     },
     {
       "evaluation_name": "Public Relations - EM",
+      "source_data": {
+        "dataset_name": "Public Relations",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2461,14 +2735,23 @@
         }
       },
       "generation_config": {
-        "subject": "public_relations",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_public_relations"
+        "additional_details": {
+          "subject": "public_relations",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_public_relations"
+        }
       }
     },
     {
       "evaluation_name": "Security Studies - EM",
+      "source_data": {
+        "dataset_name": "Security Studies",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2514,14 +2797,23 @@
         }
       },
       "generation_config": {
-        "subject": "security_studies",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_security_studies"
+        "additional_details": {
+          "subject": "security_studies",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_security_studies"
+        }
       }
     },
     {
       "evaluation_name": "Sociology - EM",
+      "source_data": {
+        "dataset_name": "Sociology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2567,14 +2859,23 @@
         }
       },
       "generation_config": {
-        "subject": "sociology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_sociology"
+        "additional_details": {
+          "subject": "sociology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_sociology"
+        }
       }
     },
     {
       "evaluation_name": "Virology - EM",
+      "source_data": {
+        "dataset_name": "Virology",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2620,14 +2921,23 @@
         }
       },
       "generation_config": {
-        "subject": "virology",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_virology"
+        "additional_details": {
+          "subject": "virology",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_virology"
+        }
       }
     },
     {
       "evaluation_name": "World Religions - EM",
+      "source_data": {
+        "dataset_name": "World Religions",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
         "lower_is_better": false,
@@ -2673,14 +2983,23 @@
         }
       },
       "generation_config": {
-        "subject": "world_religions",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_world_religions"
+        "additional_details": {
+          "subject": "world_religions",
+          "method": "multiple_choice_joint",
+          "eval_split": "test",
+          "groups": "mmlu_world_religions"
+        }
       }
     },
     {
       "evaluation_name": "Mean win rate",
+      "source_data": {
+        "dataset_name": "helm_mmlu",
+        "source_type": "url",
+        "url": [
+          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+        ]
+      },
       "metric_config": {
         "evaluation_description": "How many models this model outperforms on average (over columns).",
         "lower_is_better": false,
@@ -2691,11 +3010,12 @@
       "score_details": {
         "score": 0.325,
         "details": {
-          "description": null,
           "tab": "Efficiency"
         }
       },
-      "generation_config": {}
+      "generation_config": {
+        "additional_details": {}
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/scripts/HELM/parse_helm_leaderboards.sh b/scripts/HELM/parse_helm_leaderboards.sh
new file mode 100755
index 000000000..a89a1a64e
--- /dev/null
+++ b/scripts/HELM/parse_helm_leaderboards.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Run the HELM adapter once per HELM leaderboard, passing the leaderboard
+# name and the URL of the published JSON that the adapter should convert.
+#
+# NOTE(review): assumes the adapter is importable as scripts.HELM.adapter from
+# the current directory (i.e. the script is run from the repo root) -- confirm.
+
+# Abort on the first failing adapter run instead of silently moving on.
+set -euo pipefail
+
+uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Capabilities --source_data_url https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json
+
+uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Lite --source_data_url https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json
+
+uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Classic --source_data_url https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json
+
+uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Instruct --source_data_url https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json
+
+uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_MMLU --source_data_url https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json
diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py
index 3297cfac9..acb5330d9 100644
--- a/utils/helm/adapter.py
+++ b/utils/helm/adapter.py
@@ -22,10 +22,12 @@
     EvaluationLog,
     EvaluationResult,
     EvaluatorRelationship,
+    GenerationConfig,
     MetricConfig,
     ModelInfo,
     ScoreDetails,
     ScoreType,
+    SourceDataUrl
 )
 
 import sys
@@ -114,7 +116,7 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T
     else:
         spec = run_spec_names[0]
         args = spec.split(":", 1)[1].split(",")
-
+        
         model_details = next(
             (arg.split("=", 1)[1] for arg in args if arg.startswith("model=")),
             "",
@@ -126,12 +128,14 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T
     if developer == "unknown":
         developer = get_developer(model_name)
 
-    return make_model_info(
+    model_info = make_model_info(
         model_name=model_name,
         developer=developer,
         inference_platform="unknown",
-    ), model_id
+    )
+    model_info.id = model_id
 
+    return model_info
 
 def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]):
     """Determine min/max values for each metric column."""
@@ -152,7 +156,6 @@ def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]):
 def convert(
     leaderboard_name: str,
     leaderboard_data: List[Dict[str, Any]],
-    source_data: List[str],
 ):
     """Convert HELM leaderboard data into unified evaluation logs."""
     retrieved_timestamp = str(time.time())
@@ -172,9 +175,9 @@ def convert(
             model_name = row[0].get("value")
 
             if model_name not in model_infos:
-                model_info, model_id = extract_model_info_from_row(row, model_name)
+                model_info = extract_model_info_from_row(row, model_name)
                 model_infos[model_name] = model_info
-                model_ids[model_name] = model_id
+                model_ids[model_name] = model_info.id
 
             for col_idx, (header, cell) in enumerate(zip(headers[1:], row[1:])):
                 full_eval_name = header.get("value")
@@ -203,6 +206,17 @@ def convert(
                         score_type=ScoreType.continuous,
                     )
 
+                    if full_eval_name.lower().startswith('mean'):
+                        dataset_name = leaderboard_name
+                    else:
+                        dataset_name = full_eval_name.split(' - ')[0]
+
+                    source_data = SourceDataUrl(
+                        dataset_name=dataset_name,
+                        source_type='url',
+                        url=[args.source_data_url]
+                    )
+
                     generation_config = (
                         extract_generation_config(cell.get("run_spec_names", []))
                         if cell.get("run_spec_names")
@@ -211,6 +225,7 @@ def convert(
 
                     model_results[model_name][short_name] = EvaluationResult(
                         evaluation_name=full_eval_name,
+                        source_data=source_data,
                         metric_config=metric_config,
                         score_details=ScoreDetails(
                             score=round(cell.get("value"), 3)
@@ -221,7 +236,9 @@ def convert(
                                 "tab": tab_name,
                             },
                         ),
-                        generation_config=generation_config,
+                        generation_config=GenerationConfig(
+                            additional_details=generation_config
+                        )
                     )
                 else:
                     # Add extra score details under the same metric
@@ -232,12 +249,16 @@ def convert(
                         else f"{full_eval_name} - {tab_name}"
                     )
 
-                    existing.score_details.details[detail_key] = {
-                        "description": cell.get("description"),
-                        "tab": tab_name,
-                        "score": cell.get("value"),
-                    }
-
+                    setattr(
+                        existing.score_details.details,
+                        detail_key,
+                        {
+                            "description": cell.get("description"),
+                            "tab": tab_name,
+                            "score": cell.get("value"),
+                        }
+                    )
+                
     # Save evaluation logs
     for model_name, results_by_metric in model_results.items():
         model_info = model_infos[model_name]
@@ -250,7 +271,7 @@ def convert(
         )
 
         eval_log = EvaluationLog(
-            schema_version="0.1.0",
+            schema_version="0.2.0",
             evaluation_id=evaluation_id,
             retrieved_timestamp=retrieved_timestamp,
             source_metadata=make_source_metadata(
@@ -259,7 +280,6 @@ def convert(
                 evaluator_relationship=EvaluatorRelationship.third_party,
             ),
             model_info=model_info,
-            source_data=source_data,
             evaluation_results=list(results_by_metric.values()),
         )
 
@@ -287,15 +307,13 @@ def convert(
     args = parse_args()
 
     leaderboard_name = args.leaderboard_name.lower()
-    source_data = [args.source_data_url]
 
     print(f"Fetching {leaderboard_name} data from {args.source_data_url}")
-    leaderboard_data = fetch_json(source_data[0])
+    leaderboard_data = fetch_json(args.source_data_url)
 
     convert(
         leaderboard_name=leaderboard_name,
-        leaderboard_data=leaderboard_data,
-        source_data=source_data,
+        leaderboard_data=leaderboard_data
     )
 
     print("Done!")

From b77590228e88dde571bf32017cdf923e787a8c1b Mon Sep 17 00:00:00 2001
From: Damian Stachura <damian.stachura@evidenceprime.com>
Date: Wed, 11 Feb 2026 19:53:47 +0100
Subject: [PATCH 2/2] Fix naming conventions

---
 ...8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json} |  24 +-
 ...7d2d1dba-1b31-47b2-8308-f2668cf36c99.json} |  24 +-
 ...3a056f7b-1bdf-4543-9e67-1101ace67179.json} |  24 +-
 ...275cf2e5-5ccd-40be-be55-938c82ef6688.json} |  24 +-
 ...43e7be99-4872-4eb1-b30b-75c44b298ab4.json} |  24 +-
 ...cfc99298-4570-48cf-9187-aa0d167cc0ba.json} |  24 +-
 ...a2162367-d16d-4274-aa89-43435cea5c0b.json} |  24 +-
 ...51ef4580-da13-415a-a37f-45e2036ed4c2.json} |  24 +-
 ...3fa605db-fcff-4f05-9398-6af77c9dcada.json} |  24 +-
 ...9d58ac39-fef7-47c8-920a-8be2069f5662.json} |  24 +-
 ...dd9b10af-ad39-45ef-8f91-097340d376c7.json} |  24 +-
 ...30a6de14-c57c-483e-92e9-26fc4c7f4772.json} |  24 +-
 ...bed1a799-77a6-40a1-9f37-d54fe9d4d055.json} |  24 +-
 ...6c226cad-23f1-4c09-8038-eb7b776cdee4.json} |  24 +-
 ...98887061-09d6-44ba-9cff-0267045a26ef.json} |  24 +-
 ...6693f0e2-3514-413d-be61-d10f7372b3dc.json} |  24 +-
 ...ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json} |  24 +-
 ...0d9a856d-01bf-4a82-9872-33d561cf4a57.json} |  24 +-
 ...3ff2ab7d-2c0f-4313-8223-8f514fde595a.json} |  24 +-
 ...2a46e8da-1996-428c-b567-cd0287b29d9f.json} |  24 +-
 ...30a92593-398e-4c2f-8be7-455be166aeaf.json} |  24 +-
 ...e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json} |  24 +-
 ...dfc2717d-ead8-4287-885e-5e0fc09c35e3.json} |  24 +-
 ...e97292eb-7031-4a3a-a415-44c137898e3f.json} |  24 +-
 ...4263a6be-9640-40a1-8881-768624949d47.json} |  24 +-
 ...a808cecf-8925-428f-99ea-b6c2f8bce96e.json} |  24 +-
 ...55e44a3b-1fac-4ad5-b25e-85702f33883d.json} |  24 +-
 ...5b5b339b-7631-4b77-ac51-df49d3e946eb.json} |  24 +-
 ...eaec6d66-6da7-4592-baca-2539240acc5d.json} |  24 +-
 ...2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json} |  24 +-
 ...eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json} |  24 +-
 ...75b5943a-67be-4b2f-85da-a52533edc76f.json} |  24 +-
 ...8bec35b7-271a-457d-b665-9f69baa248aa.json} |  24 +-
 ...c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json} |  24 +-
 ...c308b0a5-4c44-4369-9b23-8664959aa927.json} |  24 +-
 ...1a1edfb2-f0f1-4930-82c0-99293ec76645.json} |  24 +-
 ...9aa5af51-8c55-4896-b634-162a9d82b58e.json} |  24 +-
 ...21461a52-2f25-48c9-be19-f9233317d817.json} |  24 +-
 ...bdea0967-fcc7-493c-a18d-70727842deb9.json} |  24 +-
 ...f7404ea3-62c7-47fc-9106-44c208470381.json} |  24 +-
 ...2817820c-4b28-4235-a8fd-ad02d0f504bc.json} |  24 +-
 ...f3da71fc-fc88-4dda-b423-168d11eab317.json} |  24 +-
 ...2f7c0db9-b5de-4674-a130-5315520dea68.json} |  24 +-
 ...4dcb8022-fe54-42f7-b43f-9866de173731.json} |  24 +-
 ...c436f3d1-84ee-49df-9287-0305925f7cf4.json} |  24 +-
 ...90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json} |  24 +-
 ...07c823ba-9e17-47e4-858b-a1f2a514a276.json} |  24 +-
 ...eb1bb443-71ad-4b79-8308-2b66c5e8c631.json} |  24 +-
 ...e14d42a9-9639-4c35-8a0c-e395e754c46c.json} |  24 +-
 ...3754df44-ddce-4a66-9074-f65f5677ae27.json} |  24 +-
 ...a540b282-e9d6-403e-96df-a1d27ad14d3a.json} |  24 +-
 ...758851b3-9ac9-43d8-8b6a-3d9688752d80.json} |  24 +-
 ...1d9ac688-ca0d-405b-a262-e95673e79250.json} |  24 +-
 ...c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json} |  24 +-
 ...35a31e19-2ef5-4caa-a848-422af42adab8.json} |  24 +-
 ...7de0bda2-ce56-444a-b293-a310a5b2d7ab.json} |  24 +-
 ...dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json} |  24 +-
 ...9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json} |  24 +-
 ...07763926-3a19-43f9-a23f-095f6cb78799.json} |  24 +-
 ...56e024b3-c963-4172-9f52-7605276b3854.json} |  24 +-
 ...6f660e47-1d86-473d-9864-208111dcea31.json} |  24 +-
 ...91ef1f96-a708-4c53-ac9d-208ef3420668.json} |  24 +-
 ...c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json} |  24 +-
 ...505c6245-88d1-4557-9e34-63a4e8086210.json} |  24 +-
 ...9a473236-f187-4926-ae8a-e8b84fe2a060.json} |  24 +-
 ...1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json} |  24 +-
 ...aeabfb59-74db-445c-9693-7a088ac5073c.json} |  24 +-
 ...eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json} |  24 +-
 ...12fdea65-94eb-4c85-876c-65f0528bde12.json} |  60 ++---
 ...d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json} |  60 ++---
 ...1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json} |  60 ++---
 ...deddbc80-70ac-43e7-b052-753d127f8390.json} |  60 ++---
 ...e4780862-bf3c-4856-b1e7-02616afe931a.json} |  60 ++---
 ...cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json} |  60 ++---
 ...13a22d40-f274-4384-adcc-1539da821c6a.json} |  60 ++---
 ...a01f642e-730b-461d-8afe-9c077ab3f149.json} |  60 ++---
 ...813802a3-483e-443d-9e49-7cd581b5ea6d.json} |  60 ++---
 ...90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json} |  60 ++---
 ...d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json} |  60 ++---
 ...3dc29785-a884-4496-a6f4-a8bf19892e50.json} |  60 ++---
 ...ff8dc291-bbaf-4149-854e-e1780b0c86d5.json} |  60 ++---
 ...b8932181-b669-4b0e-8879-1dfbf9afea12.json} |  60 ++---
 ...c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json} |  60 ++---
 ...579fb908-3c36-4ff8-a262-fd5388806b83.json} |  60 ++---
 ...68ff9f10-0357-4ea8-b758-de6c7f51d669.json} |  60 ++---
 ...b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json} |  60 ++---
 ...8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json} |  60 ++---
 ...8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json} |  60 ++---
 ...6bbe052f-46f7-4541-80a3-dbb86433db7a.json} |  60 ++---
 ...9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json} |  60 ++---
 ...742a59e8-c813-42ef-938a-4897e25dcdad.json} |  60 ++---
 ...5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json} |  60 ++---
 ...509360bc-86f5-49dc-899c-2899d8b6bc6c.json} |  60 ++---
 ...8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json} |  60 ++---
 ...8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json} |  60 ++---
 ...7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json} |  60 ++---
 ...d65d8f48-8b8e-4ec6-af68-f61af5408adf.json} |  60 ++---
 ...dff69882-cb8b-4323-b587-60f295085459.json} |  60 ++---
 ...90220411-5e4d-4b74-a74c-ca2ad030d50e.json} |  60 ++---
 ...8c2465b2-deca-476c-bb41-836685ceab35.json} |  60 ++---
 ...4b0f6a03-1054-4047-82d1-53992f0378ee.json} |  60 ++---
 ...78bc128a-6e53-4086-9498-2b3428e1d884.json} |  60 ++---
 ...2be7887e-6c91-437c-bbfc-8b68de3330da.json} |  60 ++---
 ...f135ce21-655f-4ebf-9cc6-d83ada0f177b.json} |  60 ++---
 ...48912a61-af54-4208-b36d-2f3a283e5c5d.json} |  60 ++---
 ...cc85315f-4472-4b22-9f0a-e4609676ce13.json} |  60 ++---
 ...ab773619-db5e-449b-8d6b-da743cb038bb.json} |  60 ++---
 ...5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json} |  60 ++---
 ...32cc2aa3-be26-41bd-8124-a8b1073c84c4.json} |  60 ++---
 ...42a86a4a-7e76-4c7d-af48-e765a38df589.json} |  60 ++---
 ...f9746ed1-887f-4850-ac2d-700de18acbaf.json} |  60 ++---
 ...899521d0-e5eb-4e1b-af5a-78b3bd32e232.json} |  60 ++---
 ...1fb2c6db-2495-4609-a96b-57815c579953.json} |  60 ++---
 ...a5b6cc8b-676d-4c19-8093-0b893937e3d4.json} |  60 ++---
 ...0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json} |  60 ++---
 ...bc207557-fb49-4a87-8401-22c3ce853e7c.json} |  60 ++---
 ...895266ee-71a5-4ca5-b3f9-62df6383ff95.json} |  60 ++---
 ...8828e9e8-5716-41b4-a2d1-233bb056dc32.json} |  60 ++---
 ...f267ba72-b239-4126-99c5-675f79b1ae95.json} |  60 ++---
 ...f386e763-8078-454b-bd14-32b106663d53.json} |  60 ++---
 ...a4739cda-028b-48e0-b3b5-ca9b583d03f5.json} |  60 ++---
 ...837e20ff-fed1-4431-b643-63b904055c66.json} |  60 ++---
 ...e411f017-22c6-4d49-9bf9-5d99c1091791.json} |  60 ++---
 ...7bd2b266-5a65-4c63-bf18-5e4114564bfc.json} |  60 ++---
 ...49a1423e-d5f4-4665-b81e-d491f492a316.json} |  60 ++---
 ...8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json} |  60 ++---
 ...ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json} |  60 ++---
 ...a2b4ed40-b04f-481f-986b-25a2c26bbb79.json} |  60 ++---
 ...e88f9163-5334-43ed-9b56-154bf543f898.json} |  60 ++---
 ...6d436bd5-9d49-4895-8c07-7814b2eef12c.json} |  60 ++---
 ...681d0d6d-de06-4b8e-a7e2-964d98e2806e.json} |  60 ++---
 ...e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json} |  60 ++---
 ...cb80bd5f-204a-4dd8-96ec-40c7df93975f.json} |  60 ++---
 ...f84f84a8-7191-42ac-8951-5d7141a0f700.json} |  60 ++---
 ...9ba74767-b675-460a-bb68-e82adb6acd2f.json} |  60 ++---
 ...e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json} |  28 +--
 ...60724488-914d-4efe-98d6-f3ff26fe8fbc.json} |  28 +--
 ...2aaae404-b510-41e0-9a4a-b2d053731454.json} |  28 +--
 ...053badb4-b50a-434a-909c-c4d939c00b4e.json} |  28 +--
 ...7b4a4c6d-e302-4010-a099-5b01c874ffe8.json} |  40 ++--
 ...db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json} |  40 ++--
 ...f6808908-79d9-4de5-8434-94e4bdb854f2.json} |  40 ++--
 ...1a039ef6-5957-4246-82b2-bc607b6554e7.json} |  40 ++--
 ...fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json} |  40 ++--
 ...0e2790d3-40f1-4124-ba41-b65bd9de1852.json} |  40 ++--
 ...d55129d3-4eae-4009-a897-fa1624cea6a2.json} |  40 ++--
 ...6332f0b3-7fab-41ed-a8da-46b142051377.json} |  40 ++--
 ...0cb33741-ca10-40f5-90d3-28e300901ad3.json} |  40 ++--
 ...80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json} |  40 ++--
 ...de41775f-f60e-481e-a8ef-3df9a9b65a5a.json} |  40 ++--
 ...bc29d5c6-b5c8-473b-b69c-054026829089.json} |  40 ++--
 ...ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json} |  40 ++--
 ...4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json} |  40 ++--
 ...9ef56d5a-de00-4d89-930c-a4c74211dd78.json} |  40 ++--
 ...5598d3ed-5b37-4aec-b186-0b16c394633b.json} |  40 ++--
 ...a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json} |  40 ++--
 ...54bac699-aa82-4133-8c10-c6510c2a7f95.json} |  40 ++--
 ...79b23601-3148-4256-88ce-67e439a87c5b.json} |  40 ++--
 ...e92648e4-75c6-4944-9ec1-880823fefc87.json} |  40 ++--
 ...449feffd-d2e3-4a08-ad69-b8ad522532ae.json} |  40 ++--
 ...d297b253-0f4f-4caf-864b-9f457ab589da.json} |  40 ++--
 ...d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json} |  40 ++--
 ...cb409208-034d-42fd-acce-ab5cc4227383.json} |  40 ++--
 ...b2572ef8-446a-45b4-b557-45736418753b.json} |  40 ++--
 ...70d85516-b710-4b27-b664-03a6a822773b.json} |  40 ++--
 ...a8208df4-eb37-47d2-8845-f821e80e9858.json} |  40 ++--
 ...22cde248-40ab-43b0-a408-6d8b84692f22.json} |  40 ++--
 ...b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json} |  40 ++--
 ...ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json} |  40 ++--
 ...8721a15b-9102-4b1a-bde8-e5371f00f1b5.json} |  40 ++--
 ...23b3a30c-8aa3-4684-be54-adae003720fc.json} |  40 ++--
 ...7022c444-d6b8-4374-be0c-14835e5fd281.json} |  40 ++--
 ...bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json} |  40 ++--
 ...bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json} |  40 ++--
 ...527418d0-2591-43c9-b639-17328292b110.json} |  40 ++--
 ...8ddc465f-4f2d-4213-81c4-70b584d48047.json} |  40 ++--
 ...eca63d17-7fc2-4722-8bb3-0be99a257100.json} |  40 ++--
 ...e40a10b3-e682-4715-b2ee-4efcae050a58.json} |  40 ++--
 ...56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json} |  40 ++--
 ...f47ca10d-cd45-485e-b9cf-0c6592d63656.json} |  40 ++--
 ...7f0e318e-31bf-4044-bffb-357c1238d4fd.json} |  40 ++--
 ...818d6d72-0b5c-4fcf-b808-1d186223301e.json} |  40 ++--
 ...f09b853b-dbbc-4252-a0f0-a2c45c29f670.json} |  40 ++--
 ...f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json} |  40 ++--
 ...83c6a723-87a0-43d4-968e-86d186578e9e.json} |  40 ++--
 ...daaf221b-1759-4619-91fb-938e81975787.json} |  40 ++--
 ...6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json} |  40 ++--
 ...1043b815-b247-4444-bf8c-0b92b793c57f.json} |  40 ++--
 ...28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json} |  40 ++--
 ...73dedd31-7d40-4ee6-994d-00eb7d656597.json} |  40 ++--
 ...18da1dfa-5366-477b-a9cf-af29c5a99b68.json} |  40 ++--
 ...80057cc1-45ab-4976-878e-be963eaa83b1.json} |  40 ++--
 ...d896249f-bbd9-4657-a5db-5968544cb5fa.json} |  40 ++--
 ...9f73f3e5-b573-45d4-8c98-82f5c496f786.json} |  40 ++--
 ...a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json} |  40 ++--
 ...4ff688da-61a0-43ce-9c2d-e1c197887683.json} |  40 ++--
 ...181003ea-7587-4c93-8b89-c5c76958313d.json} |  40 ++--
 ...66688228-e59a-4caa-b3fb-c5df1efc9db4.json} |  40 ++--
 ...2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json} |  40 ++--
 ...077fe37f-b3a4-483a-93a5-034c6445fe98.json} |  40 ++--
 ...4fbb173c-b900-4e11-87bd-1ac6a489d014.json} |  40 ++--
 ...e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json} |  40 ++--
 ...0925f9b7-08f8-485f-84bc-a153a54aa417.json} |  40 ++--
 ...08082277-8305-4007-97cd-88202fc0115c.json} |  40 ++--
 ...fe554cbd-2480-40bd-b2f5-464cad700c14.json} |  40 ++--
 ...9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json} |  40 ++--
 ...d9654997-1d3e-41c3-9f16-05a36dde9b02.json} |  40 ++--
 ...73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json} |  40 ++--
 ...4d01d929-b5e2-42dc-89ee-20560f560db5.json} |  40 ++--
 ...76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json} |  40 ++--
 ...69ea0ef0-c136-4cff-9607-6ae12e0692c3.json} |  40 ++--
 ...bbe708f3-fb78-49e9-876d-cae57f1231cc.json} |  40 ++--
 ...ab7b7951-0792-4538-8a7a-6baee8602cbb.json} |  40 ++--
 ...fc94c95d-9678-4f23-b82f-190a08ece307.json} |  40 ++--
 ...3f92e2fc-9831-4c2c-b94e-af33d457fa82.json} |  40 ++--
 ...3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json} |  40 ++--
 ...6b2891bd-2444-4286-8ccf-c91181856d29.json} |  40 ++--
 ...bd924bd3-e13c-48e0-b339-8c15c5072038.json} |  40 ++--
 ...b8a6f32a-9904-43bb-9add-89404093a9db.json} |  40 ++--
 ...c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json} |  40 ++--
 ...9c1fc50a-437d-458b-926c-33cabdcc4aeb.json} |  40 ++--
 ...5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json} |  40 ++--
 ...10e1abfa-83de-4960-8d4c-c5099894cb80.json} |  40 ++--
 ...40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json} |  40 ++--
 ...2abf3bb8-a78f-4a59-807e-52da4e6426fd.json} |  40 ++--
 ...ae28615a-b7fa-4782-89e1-4b8e4804dc62.json} |  40 ++--
 ...52bb6ab9-e80b-4bf0-a375-7706f16d311d.json} |  40 ++--
 ...fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json} |  40 ++--
 ...1158720a-9a0e-492e-a677-9b0936f4cde5.json} |  40 ++--
 ...254ded81-4051-420d-b402-2e7b80a23848.json} |  40 ++--
 ...ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json} | 214 +++++++++---------
 ...7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json} | 214 +++++++++---------
 ...5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json} | 214 +++++++++---------
 ...0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json} | 214 +++++++++---------
 ...92e0b1b9-c167-4e07-b770-2b78527eb4eb.json} | 214 +++++++++---------
 ...3da06ad4-0770-45f5-a6a2-9ef9500cef05.json} | 214 +++++++++---------
 ...c1c79360-60bd-4f5d-a746-e0411b94f69b.json} | 214 +++++++++---------
 ...bb904716-048c-4b41-9f64-4d17c485afe3.json} | 214 +++++++++---------
 ...063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json} | 214 +++++++++---------
 ...c8949c55-8987-4ed3-b74b-8b13b4381806.json} | 214 +++++++++---------
 ...ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json} | 214 +++++++++---------
 ...bc9cedd7-5cb2-44b2-abda-470322570e14.json} | 214 +++++++++---------
 ...305a7f25-6e22-4146-9678-6a687a701567.json} | 214 +++++++++---------
 ...c6059976-85a1-40ce-b02f-67e182aa2f7d.json} | 214 +++++++++---------
 ...6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json} | 214 +++++++++---------
 ...f397ca7a-41c4-4926-b075-2523639f0a50.json} | 214 +++++++++---------
 ...acdf4701-e1c2-4867-bd85-d34ae8fb0991.json} | 214 +++++++++---------
 ...3cd855af-9679-4fd0-bc3f-34db697c7855.json} | 214 +++++++++---------
 ...78fb6814-e32f-4b15-b958-9e001637ba07.json} | 214 +++++++++---------
 ...f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json} | 214 +++++++++---------
 ...cefc3b25-0779-4fb3-93a5-3c7a285304af.json} | 214 +++++++++---------
 ...7e00e082-0e79-45e0-b0ff-5458cc2aff85.json} | 214 +++++++++---------
 ...ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json} | 214 +++++++++---------
 ...c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json} | 214 +++++++++---------
 ...7ea5b404-d98f-4282-81d8-6ca5f6629429.json} | 214 +++++++++---------
 ...7056c7e7-f68a-4764-aa48-a8368ae2e317.json} | 214 +++++++++---------
 ...5e67014d-6ca1-4e65-a85a-84d91e147d4d.json} | 214 +++++++++---------
 ...3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json} | 214 +++++++++---------
 ...46d5e547-507e-4c98-98a9-bad1bfad7f7b.json} | 214 +++++++++---------
 ...ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json} | 214 +++++++++---------
 ...2b31b441-caa9-465c-a2d2-051c951c7be3.json} | 214 +++++++++---------
 ...b7ea6c93-af70-4c0f-ba50-03a539416a8b.json} | 214 +++++++++---------
 ...fe4cec30-e483-49a8-80ea-00b2c6231740.json} | 214 +++++++++---------
 ...53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json} | 214 +++++++++---------
 ...af88b02d-cb29-4d2c-bb33-5fddcf316a95.json} | 214 +++++++++---------
 ...a0abcd19-58a1-478a-9786-d044a4181241.json} | 214 +++++++++---------
 ...95eda13a-cd34-4170-b2db-f2ead47250f9.json} | 214 +++++++++---------
 ...7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json} | 214 +++++++++---------
 ...9da7439c-e96b-444f-b4fa-7ef638080740.json} | 214 +++++++++---------
 ...294b22a0-1676-4d8c-8ad2-5cdc40267255.json} | 214 +++++++++---------
 ...1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json} | 214 +++++++++---------
 ...78f2484e-bc73-4026-929b-db345e92cf5a.json} | 214 +++++++++---------
 ...8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json} | 214 +++++++++---------
 ...41af381a-3637-4578-a582-59d9b1327d95.json} | 214 +++++++++---------
 ...96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json} | 214 +++++++++---------
 ...bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json} | 214 +++++++++---------
 ...e036de72-b425-4aa5-9448-dc52560e60db.json} | 214 +++++++++---------
 ...65423181-18f1-4296-98c2-171356106404.json} | 214 +++++++++---------
 ...41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json} | 214 +++++++++---------
 ...f78d6e0a-a397-4a41-a37e-696bda5a1987.json} | 214 +++++++++---------
 ...d2bf70ce-341f-49d7-bd03-87b523826953.json} | 214 +++++++++---------
 ...b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json} | 214 +++++++++---------
 ...08590b6e-7050-413d-844b-1f3f1c5aa444.json} | 214 +++++++++---------
 ...2d18fd88-73b5-4d4c-a1cc-e66a20316605.json} | 214 +++++++++---------
 ...567918be-be6f-4e41-b613-727828fe8a44.json} | 214 +++++++++---------
 ...c2be131b-808c-4947-b24f-69ef6af499d7.json} | 214 +++++++++---------
 ...24955250-a2e9-475f-a866-30a835579e03.json} | 214 +++++++++---------
 ...de6f7e19-b54a-4bd3-b624-29f66afbee15.json} | 214 +++++++++---------
 ...e4c3032d-04e0-414b-a7e9-e30756d82000.json} | 214 +++++++++---------
 ...e9a41d4b-56c7-47f0-b439-72ad1e463000.json} | 214 +++++++++---------
 ...a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json} | 214 +++++++++---------
 ...fd6aea24-dc18-41ce-bc19-23f461a39032.json} | 214 +++++++++---------
 ...625d33ce-a320-4bfd-a962-451b8c22d392.json} | 214 +++++++++---------
 ...e51be257-610e-4d38-b58a-a3b29fc06a83.json} | 214 +++++++++---------
 ...9e0b9f48-f913-4bbe-a135-59e596c9e479.json} | 214 +++++++++---------
 ...189e6cc5-1c8f-4712-8dda-c108f18f836d.json} | 214 +++++++++---------
 ...4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json} | 214 +++++++++---------
 ...ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json} | 214 +++++++++---------
 ...fa6a6772-671b-402e-9480-d61e0fb4a61e.json} | 214 +++++++++---------
 ...b5279e94-ae7f-4671-9315-874e162a24fd.json} | 214 +++++++++---------
 ...de00e8da-9c83-40df-b642-b94719ce1ac2.json} | 214 +++++++++---------
 ...119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json} | 214 +++++++++---------
 ...80aabdf4-60b7-493b-98d8-1854f1c41c10.json} | 214 +++++++++---------
 ...29958cee-32c9-4d51-8f14-72db4273459f.json} | 214 +++++++++---------
 ...72537b16-feda-4e5e-a477-f415650db847.json} | 214 +++++++++---------
 ...7df68af5-667a-4125-9c12-e71fb5af0a74.json} | 214 +++++++++---------
 ...1845eb8b-4c94-4d22-8771-012f7230dc62.json} | 214 +++++++++---------
 ...b2c8cfd1-f09a-4616-8038-c7e1930bce74.json} | 214 +++++++++---------
 ...12976629-cefe-4329-b974-bb17f88d385d.json} | 214 +++++++++---------
 utils/helm/adapter.py                         |  24 +-
 310 files changed, 13172 insertions(+), 13162 deletions(-)
 rename data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/{bd982107-7c03-4ee8-8a38-782d68883818.json => 8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json} (92%)
 rename data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/{25aa6e41-ab16-4f63-9613-bfb83b9151c5.json => 7d2d1dba-1b31-47b2-8308-f2668cf36c99.json} (92%)
 rename data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/{ddd52881-1248-4652-9f1d-5d2b58ede889.json => 3a056f7b-1bdf-4543-9e67-1101ace67179.json} (92%)
 rename data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/{365bc693-73b6-41fe-a8fa-eba7b91febe0.json => 275cf2e5-5ccd-40be-be55-938c82ef6688.json} (92%)
 rename data/helm_capabilities/amazon/nova-lite-v1_0/{a126b881-918a-411a-90e9-32d7b63d1e00.json => 43e7be99-4872-4eb1-b30b-75c44b298ab4.json} (92%)
 rename data/helm_capabilities/amazon/nova-micro-v1_0/{b8e54bb1-0768-4558-8dc2-4897d4e571aa.json => cfc99298-4570-48cf-9187-aa0d167cc0ba.json} (92%)
 rename data/helm_capabilities/amazon/nova-premier-v1_0/{a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json => a2162367-d16d-4274-aa89-43435cea5c0b.json} (92%)
 rename data/helm_capabilities/amazon/nova-pro-v1_0/{2413b504-7125-461b-ae9d-0c58211a5358.json => 51ef4580-da13-415a-a37f-45e2036ed4c2.json} (92%)
 rename data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/{f350d9d1-b743-4017-bc68-a4dc726515d0.json => 3fa605db-fcff-4f05-9398-6af77c9dcada.json} (92%)
 rename data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/{c32a1f0a-bf8a-42be-b155-4f87465235bc.json => 9d58ac39-fef7-47c8-920a-8be2069f5662.json} (92%)
 rename data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/{96cfde1b-77de-4d2a-8b45-938116795108.json => dd9b10af-ad39-45ef-8f91-097340d376c7.json} (92%)
 rename data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/{56c180e5-45aa-4106-8f92-c6566c3c7dfc.json => 30a6de14-c57c-483e-92e9-26fc4c7f4772.json} (92%)
 rename data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/{d633fcd6-eb01-49ff-ba7c-6ca12734746f.json => bed1a799-77a6-40a1-9f37-d54fe9d4d055.json} (92%)
 rename data/helm_capabilities/anthropic/claude-opus-4-20250514/{7a7b49ff-5060-4d12-acb9-607125fbe081.json => 6c226cad-23f1-4c09-8038-eb7b776cdee4.json} (92%)
 rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/{287a3646-d969-4bd9-9773-86463c1ba87f.json => 98887061-09d6-44ba-9cff-0267045a26ef.json} (92%)
 rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514/{97f3892f-9588-49ef-abef-3a0c965bb352.json => 6693f0e2-3514-413d-be61-d10f7372b3dc.json} (92%)
 rename data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/{22ba68b0-6eec-47f2-b465-47f298e8da09.json => ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json} (92%)
 rename data/helm_capabilities/deepseek-ai/deepseek-r1-0528/{9e5684dc-6380-4353-b966-7205d66340fa.json => 0d9a856d-01bf-4a82-9872-33d561cf4a57.json} (92%)
 rename data/helm_capabilities/deepseek-ai/deepseek-v3/{1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json => 3ff2ab7d-2c0f-4313-8223-8f514fde595a.json} (92%)
 rename data/helm_capabilities/google/gemini-1.5-flash-002/{20512a3b-ac0f-483a-8bec-9962980c579c.json => 2a46e8da-1996-428c-b567-cd0287b29d9f.json} (92%)
 rename data/helm_capabilities/google/gemini-1.5-pro-002/{704c5c74-a0ee-457d-9b4e-3ae895ffc105.json => 30a92593-398e-4c2f-8be7-455be166aeaf.json} (92%)
 rename data/helm_capabilities/google/gemini-2.0-flash-001/{eb9224b8-0edb-4605-a2ee-cfb63f41370e.json => e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json} (92%)
 rename data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/{4cb58f80-c2b1-45c6-b781-19af47660eb0.json => dfc2717d-ead8-4287-885e-5e0fc09c35e3.json} (91%)
 rename data/helm_capabilities/google/gemini-2.5-flash-lite/{6307e0c4-c983-4257-82d8-b2a50171eb8a.json => e97292eb-7031-4a3a-a415-44c137898e3f.json} (92%)
 rename data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/{275cd615-bddf-4afe-a499-b463fe183486.json => 4263a6be-9640-40a1-8881-768624949d47.json} (92%)
 rename data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/{03b48360-a387-44ba-94b2-2eb7c234a9fa.json => a808cecf-8925-428f-99ea-b6c2f8bce96e.json} (92%)
 rename data/helm_capabilities/google/gemini-3-pro-preview/{3a242fb8-07f9-460e-93eb-345aab0f994f.json => 55e44a3b-1fac-4ad5-b25e-85702f33883d.json} (92%)
 rename data/helm_capabilities/ibm/granite-3.3-8b-instruct/{5e5720d0-67fe-40a9-b65b-d4154848d83c.json => 5b5b339b-7631-4b77-ac51-df49d3e946eb.json} (92%)
 rename data/helm_capabilities/ibm/granite-4.0-h-small/{9c9239df-0cbb-411f-af40-1b2782f91255.json => eaec6d66-6da7-4592-baca-2539240acc5d.json} (92%)
 rename data/helm_capabilities/ibm/granite-4.0-micro/{e1d12d96-185f-493e-bb08-8237623fb736.json => 2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json} (92%)
 rename data/helm_capabilities/marin-community/marin-8b-instruct/{aba1fded-b031-48df-87ef-dc744df33501.json => eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json} (92%)
 rename data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/{98f69aa6-b227-4076-a76e-1293cbe1c6cb.json => 75b5943a-67be-4b2f-85da-a52533edc76f.json} (92%)
 rename data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/{d2bb087e-a275-4fce-b6dc-001fd4545883.json => 8bec35b7-271a-457d-b665-9f69baa248aa.json} (92%)
 rename data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/{84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json => c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json} (92%)
 rename data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/{23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json => c308b0a5-4c44-4369-9b23-8664959aa927.json} (92%)
 rename data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/{9cab3a77-4f32-48d0-ba11-e2323ccc4861.json => 1a1edfb2-f0f1-4930-82c0-99293ec76645.json} (92%)
 rename data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/{9e037c92-1253-49be-b31a-3aa017531d77.json => 9aa5af51-8c55-4896-b634-162a9d82b58e.json} (92%)
 rename data/helm_capabilities/mistralai/mistral-large-2411/{bd26c7cb-ce76-4b17-b617-d1d93a168c93.json => 21461a52-2f25-48c9-be19-f9233317d817.json} (92%)
 rename data/helm_capabilities/mistralai/mistral-small-2503/{9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json => bdea0967-fcc7-493c-a18d-70727842deb9.json} (92%)
 rename data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/{d69a1cbe-353c-4be9-b93b-5224d24c7adf.json => f7404ea3-62c7-47fc-9106-44c208470381.json} (92%)
 rename data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/{915cb39d-f21f-4ef1-a95f-f44f79ede893.json => 2817820c-4b28-4235-a8fd-ad02d0f504bc.json} (92%)
 rename data/helm_capabilities/moonshotai/kimi-k2-instruct/{fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json => f3da71fc-fc88-4dda-b423-168d11eab317.json} (92%)
 rename data/helm_capabilities/openai/gpt-4.1-2025-04-14/{eb51f418-6abf-4b2c-9f57-0b830c00bd15.json => 2f7c0db9-b5de-4674-a130-5315520dea68.json} (92%)
 rename data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/{41cd14b0-46ba-49da-844a-19fe866bef1e.json => 4dcb8022-fe54-42f7-b43f-9866de173731.json} (92%)
 rename data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/{7de93642-a4bc-430b-8733-9befeb6a0e23.json => c436f3d1-84ee-49df-9287-0305925f7cf4.json} (92%)
 rename data/helm_capabilities/openai/gpt-4o-2024-11-20/{4f18292a-1fef-4feb-9b17-045c96e3e137.json => 90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json} (92%)
 rename data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/{7458c032-b24d-4f13-a659-b6e19d19a8e1.json => 07c823ba-9e17-47e4-858b-a1f2a514a276.json} (92%)
 rename data/helm_capabilities/openai/gpt-5-2025-08-07/{21eb1648-aad0-4297-9edc-c445e4c38694.json => eb1bb443-71ad-4b79-8308-2b66c5e8c631.json} (92%)
 rename data/helm_capabilities/openai/gpt-5-mini-2025-08-07/{99d657ae-e850-4caf-a599-13f1b8072273.json => e14d42a9-9639-4c35-8a0c-e395e754c46c.json} (92%)
 rename data/helm_capabilities/openai/gpt-5-nano-2025-08-07/{10cd766e-442c-4b3d-833b-740417d9a6d9.json => 3754df44-ddce-4a66-9074-f65f5677ae27.json} (92%)
 rename data/helm_capabilities/openai/gpt-5.1-2025-11-13/{bc6124a7-89df-4c3e-b824-56c948d1eeb5.json => a540b282-e9d6-403e-96df-a1d27ad14d3a.json} (92%)
 rename data/helm_capabilities/openai/gpt-oss-120b/{06719cd4-5654-49b6-9dee-e112d1601d1c.json => 758851b3-9ac9-43d8-8b6a-3d9688752d80.json} (92%)
 rename data/helm_capabilities/openai/gpt-oss-20b/{ed849999-48c2-4569-8bcd-dc73084e3197.json => 1d9ac688-ca0d-405b-a262-e95673e79250.json} (91%)
 rename data/helm_capabilities/openai/o3-2025-04-16/{01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json => c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json} (92%)
 rename data/helm_capabilities/openai/o4-mini-2025-04-16/{32382d69-21c7-43a9-bb95-27607ec18cc9.json => 35a31e19-2ef5-4caa-a848-422af42adab8.json} (92%)
 rename data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/{77e702f7-37ef-4487-b047-74b13ef6d966.json => 7de0bda2-ce56-444a-b293-a310a5b2d7ab.json} (92%)
 rename data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/{4ee3c647-740c-41a6-ac66-4a38b09317ff.json => dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json} (92%)
 rename data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/{ca30726a-00a6-4228-94fe-5dce00de1d5e.json => 9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json} (92%)
 rename data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/{7862890a-298b-4bda-b8f1-7be6a5779365.json => 07763926-3a19-43f9-a23f-095f6cb78799.json} (92%)
 rename data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/{8c73a09f-ba0d-4c12-a12a-776a17292151.json => 56e024b3-c963-4172-9f52-7605276b3854.json} (92%)
 rename data/helm_capabilities/writer/palmyra-fin/{442aed0d-95c3-4436-ad63-b7b1e93307f4.json => 6f660e47-1d86-473d-9864-208111dcea31.json} (91%)
 rename data/helm_capabilities/writer/palmyra-med/{7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json => 91ef1f96-a708-4c53-ac9d-208ef3420668.json} (91%)
 rename data/helm_capabilities/writer/palmyra-x-004/{bc2c91e0-6afd-4e44-b665-d5c7558f8981.json => c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json} (92%)
 rename data/helm_capabilities/writer/palmyra-x5/{a74b74f7-ccce-4341-a122-26728cc6bece.json => 505c6245-88d1-4557-9e34-63a4e8086210.json} (91%)
 rename data/helm_capabilities/xai/grok-3-beta/{87811b75-afe8-413b-949d-7fd1f582a2e8.json => 9a473236-f187-4926-ae8a-e8b84fe2a060.json} (91%)
 rename data/helm_capabilities/xai/grok-3-mini-beta/{ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json => 1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json} (92%)
 rename data/helm_capabilities/xai/grok-4-0709/{924080a0-c530-4e6d-b1a4-107de3bd7183.json => aeabfb59-74db-445c-9693-7a088ac5073c.json} (91%)
 rename data/helm_capabilities/zai-org/glm-4.5-air-fp8/{be23c720-a99a-4945-bc0b-ddc27c8eec39.json => eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json} (92%)
 rename data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/{425d4a41-2def-4581-9b61-ee33ecb3a822.json => 12fdea65-94eb-4c85-876c-65f0528bde12.json} (91%)
 rename data/helm_classic/ai21/J1-Grande-v1-17B/{c12a8494-bafc-4097-874a-7c00636e96f8.json => d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json} (92%)
 rename data/helm_classic/ai21/J1-Grande-v2-beta-17B/{4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json => 1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json} (92%)
 rename data/helm_classic/ai21/J1-Jumbo-v1-178B/{19f61327-fcc3-408f-9254-2d6a2aadcd4e.json => deddbc80-70ac-43e7-b052-753d127f8390.json} (92%)
 rename data/helm_classic/ai21/J1-Large-v1-7.5B/{ccc17d56-bd26-409c-ac3f-262eaba9ce21.json => e4780862-bf3c-4856-b1e7-02616afe931a.json} (92%)
 rename data/helm_classic/ai21/Jurassic-2-Grande-17B/{f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json => cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json} (92%)
 rename data/helm_classic/ai21/Jurassic-2-Jumbo-178B/{9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json => 13a22d40-f274-4384-adcc-1539da821c6a.json} (92%)
 rename data/helm_classic/ai21/Jurassic-2-Large-7.5B/{f25c142c-8730-4241-a649-01d076e1f28d.json => a01f642e-730b-461d-8afe-9c077ab3f149.json} (91%)
 rename data/helm_classic/aleph-alpha/Luminous-Base-13B/{ab34f23e-36db-40c0-9681-f30b00692f98.json => 813802a3-483e-443d-9e49-7cd581b5ea6d.json} (91%)
 rename data/helm_classic/aleph-alpha/Luminous-Extended-30B/{67281534-a03d-49d8-a586-25cb1a03134e.json => 90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json} (91%)
 rename data/helm_classic/aleph-alpha/Luminous-Supreme-70B/{3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json => d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json} (91%)
 rename data/helm_classic/bigscience/BLOOM-176B/{04ce2ba4-c382-4658-ba06-1def9499a243.json => 3dc29785-a884-4496-a6f4-a8bf19892e50.json} (91%)
 rename data/helm_classic/bigscience/T0pp-11B/{3a546396-d031-4958-8410-00e0d3406089.json => ff8dc291-bbaf-4149-854e-e1780b0c86d5.json} (93%)
 rename data/helm_classic/cohere/Cohere-Command-beta-52.4B/{e7b99aa6-08e8-4224-a805-16586eb44325.json => b8932181-b669-4b0e-8879-1dfbf9afea12.json} (92%)
 rename data/helm_classic/cohere/Cohere-Command-beta-6.1B/{43a3fe19-929a-463d-a0ed-791dad765188.json => c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json} (92%)
 rename data/helm_classic/cohere/Cohere-large-v20220720-13.1B/{75468958-b75b-41fe-9813-070b793e86d9.json => 579fb908-3c36-4ff8-a262-fd5388806b83.json} (92%)
 rename data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/{6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json => 68ff9f10-0357-4ea8-b758-de6c7f51d669.json} (92%)
 rename data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/{3c9c425a-ce4a-4958-9744-7f9490ed5729.json => b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json} (92%)
 rename data/helm_classic/cohere/Cohere-small-v20220720-410M/{5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json => 8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json} (92%)
 rename data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/{8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json => 8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json} (92%)
 rename data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/{f8044c74-3f1c-4562-a21c-e448061b2077.json => 6bbe052f-46f7-4541-80a3-dbb86433db7a.json} (92%)
 rename data/helm_classic/eleutherai/Pythia-12B/{4abe3a0d-ba04-41f7-b107-59f11ff5697a.json => 9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json} (91%)
 rename data/helm_classic/eleutherai/Pythia-6.9B/{646adb7b-0761-4639-8776-83ea158bfca4.json => 742a59e8-c813-42ef-938a-4897e25dcdad.json} (91%)
 rename data/helm_classic/google/Palmyra-X-43B/{85cf6be2-d066-4e1b-b373-d53d3c922184.json => 5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json} (91%)
 rename data/helm_classic/google/T5-11B/{52db5c6d-b54e-401a-880d-8ab41a394bc4.json => 509360bc-86f5-49dc-899c-2899d8b6bc6c.json} (91%)
 rename data/helm_classic/google/UL2-20B/{68becad6-9455-4d3d-8d68-d1b4448598a1.json => 8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json} (91%)
 rename data/helm_classic/lmsys/Vicuna-v1.3-13B/{519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json => 8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json} (91%)
 rename data/helm_classic/lmsys/Vicuna-v1.3-7B/{972bc5db-f536-42f9-aa51-83cc2f59b76a.json => 7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json} (91%)
 rename data/helm_classic/meta/LLaMA-13B/{b2220101-56e0-49d9-a3d1-d3bec769ab97.json => d65d8f48-8b8e-4ec6-af68-f61af5408adf.json} (91%)
 rename data/helm_classic/meta/LLaMA-30B/{96907b25-05c3-441b-afc4-69274c20bfc3.json => dff69882-cb8b-4323-b587-60f295085459.json} (91%)
 rename data/helm_classic/meta/LLaMA-65B/{66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json => 90220411-5e4d-4b74-a74c-ca2ad030d50e.json} (91%)
 rename data/helm_classic/meta/LLaMA-7B/{70e9e156-6807-489b-b77a-367236614826.json => 8c2465b2-deca-476c-bb41-836685ceab35.json} (91%)
 rename data/helm_classic/meta/Llama-2-13B/{e90cfb46-1173-4d22-9329-9bf57cdd5241.json => 4b0f6a03-1054-4047-82d1-53992f0378ee.json} (91%)
 rename data/helm_classic/meta/Llama-2-70B/{baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json => 78bc128a-6e53-4086-9498-2b3428e1d884.json} (91%)
 rename data/helm_classic/meta/Llama-2-7B/{7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json => 2be7887e-6c91-437c-bbfc-8b68de3330da.json} (91%)
 rename data/helm_classic/meta/OPT-175B/{ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json => f135ce21-655f-4ebf-9cc6-d83ada0f177b.json} (92%)
 rename data/helm_classic/meta/OPT-66B/{26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json => 48912a61-af54-4208-b36d-2f3a283e5c5d.json} (92%)
 rename data/helm_classic/microsoft/TNLG-v2-530B/{ecd21c26-cdc4-43b1-b933-4d970df9413a.json => cc85315f-4472-4b22-9f0a-e4609676ce13.json} (91%)
 rename data/helm_classic/microsoft/TNLG-v2-6.7B/{9d4350eb-cdf0-432f-b3b0-45f4832ca950.json => ab773619-db5e-449b-8d6b-da743cb038bb.json} (91%)
 rename data/helm_classic/mistralai/Mistral-v0.1-7B/{3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json => 5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json} (91%)
 rename data/helm_classic/mosaicml/MPT-30B/{b277c87e-54b5-466f-97d7-35db4cd7b985.json => 32cc2aa3-be26-41bd-8124-a8b1073c84c4.json} (91%)
 rename data/helm_classic/mosaicml/MPT-Instruct-30B/{270df23b-9e58-4259-a8ed-0d25b9c80b2a.json => 42a86a4a-7e76-4c7d-af48-e765a38df589.json} (91%)
 rename data/helm_classic/openai/GPT-J-6B/{1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json => f9746ed1-887f-4850-ac2d-700de18acbaf.json} (92%)
 rename data/helm_classic/openai/GPT-NeoX-20B/{ef171b67-72a6-46d3-9eaf-4614ff474852.json => 899521d0-e5eb-4e1b-af5a-78b3bd32e232.json} (92%)
 rename data/helm_classic/openai/ada-350M/{e6ea5f7e-0533-4a99-8638-1cc10c454238.json => 1fb2c6db-2495-4609-a96b-57815c579953.json} (94%)
 rename data/helm_classic/openai/babbage-1.3B/{83c924fe-6318-4bad-adb0-8a81e5e28ee0.json => a5b6cc8b-676d-4c19-8093-0b893937e3d4.json} (94%)
 rename data/helm_classic/openai/curie-6.7B/{82e2c0e3-66f2-431f-b4b8-d2495970d998.json => 0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json} (94%)
 rename data/helm_classic/openai/davinci-175B/{6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json => bc207557-fb49-4a87-8401-22c3ce853e7c.json} (94%)
 rename data/helm_classic/openai/gpt-3.5-turbo-0301/{e18fbf9e-677c-49fb-ab76-475e8f605f01.json => 895266ee-71a5-4ca5-b3f9-62df6383ff95.json} (91%)
 rename data/helm_classic/openai/gpt-3.5-turbo-0613/{039af363-0c5c-4e36-8396-cd57c7e4c1de.json => 8828e9e8-5716-41b4-a2d1-233bb056dc32.json} (91%)
 rename data/helm_classic/openai/text-ada-001/{8ea1facb-260a-461d-9271-2c07b318c46f.json => f267ba72-b239-4126-99c5-675f79b1ae95.json} (94%)
 rename data/helm_classic/openai/text-babbage-001/{93007ac9-04c2-451d-abd2-2f235297747e.json => f386e763-8078-454b-bd14-32b106663d53.json} (94%)
 rename data/helm_classic/openai/text-curie-001/{b04e5f90-e46e-4d7a-a6a9-569bde072208.json => a4739cda-028b-48e0-b3b5-ca9b583d03f5.json} (94%)
 rename data/helm_classic/openai/text-davinci-002/{933dc76f-45f0-48e0-93ae-3e19cff87c2a.json => 837e20ff-fed1-4431-b643-63b904055c66.json} (94%)
 rename data/helm_classic/openai/text-davinci-003/{b8408a64-eb89-4337-8ee5-3c48e4e24437.json => e411f017-22c6-4d49-9bf9-5d99c1091791.json} (94%)
 rename data/helm_classic/stanford/Alpaca-7B/{d5846321-0800-4ff9-b85c-53c8b4884ba5.json => 7bd2b266-5a65-4c63-bf18-5e4114564bfc.json} (91%)
 rename data/helm_classic/tiiuae/Falcon-40B/{baa5f92c-b626-4e09-a084-61ce7f5dee98.json => 49a1423e-d5f4-4665-b81e-d491f492a316.json} (91%)
 rename data/helm_classic/tiiuae/Falcon-7B/{9b648e90-8d3c-403d-9ad8-382ef0b212a6.json => 8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json} (91%)
 rename data/helm_classic/tiiuae/Falcon-Instruct-40B/{0692f762-337e-4c20-8ad6-feecc93882a3.json => ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json} (91%)
 rename data/helm_classic/tiiuae/Falcon-Instruct-7B/{a91c9563-0756-4616-8a58-3c8000f73895.json => a2b4ed40-b04f-481f-986b-25a2c26bbb79.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Base-7B/{3a329574-dcf6-4177-b37c-c495e6af6cc5.json => e88f9163-5334-43ed-9b56-154bf543f898.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/{9e662c1e-e77c-4fb3-b589-127683a4b2ca.json => 6d436bd5-9d49-4895-8c07-7814b2eef12c.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Instruct-7B/{375140f6-bd3f-4b55-a35c-23de37254296.json => 681d0d6d-de06-4b8e-a7e2-964d98e2806e.json} (91%)
 rename data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/{021d0b25-8f58-47da-a58c-ac532a7972bf.json => e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json} (91%)
 rename data/helm_classic/writer/InstructPalmyra-30B/{9207fec4-d0c4-4f66-b917-f5ed57409215.json => cb80bd5f-204a-4dd8-96ec-40c7df93975f.json} (91%)
 rename data/helm_classic/yandex/YaLM-100B/{b04c8845-cccf-4856-9597-ab283bb2ec8d.json => f84f84a8-7191-42ac-8951-5d7141a0f700.json} (91%)
 rename data/helm_classic/zhipu-ai/GLM-130B/{4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json => 9ba74767-b675-460a-bb68-e82adb6acd2f.json} (91%)
 rename data/helm_instruct/anthropic/claude-v1.3/{0e30e895-aaf7-42d4-95db-7541d6b41c87.json => e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json} (73%)
 rename data/helm_instruct/cohere/command-xlarge-beta/{4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json => 60724488-914d-4efe-98d6-f3ff26fe8fbc.json} (74%)
 rename data/helm_instruct/openai/gpt-3.5-turbo-0613/{8befd29c-a16d-4e05-a92f-00b621d45e03.json => 2aaae404-b510-41e0-9a4a-b2d053731454.json} (74%)
 rename data/helm_instruct/openai/gpt-4-0314/{b2e193b8-215b-4e80-9d5a-df11f1dac88a.json => 053badb4-b50a-434a-909c-c4d939c00b4e.json} (73%)
 rename data/helm_lite/01-ai/yi-34b/{eedd0f38-6d26-4297-a469-291227ec6be6.json => 7b4a4c6d-e302-4010-a099-5b01c874ffe8.json} (85%)
 rename data/helm_lite/01-ai/yi-6b/{74c47665-740f-4784-8a27-1c1d1c29bff8.json => db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json} (85%)
 rename data/helm_lite/01-ai/yi-large-preview/{8027b577-7f48-4df5-9879-bd45ac342f42.json => f6808908-79d9-4de5-8434-94e4bdb854f2.json} (85%)
 rename data/helm_lite/AlephAlpha/luminous-base/{e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json => 1a039ef6-5957-4246-82b2-bc607b6554e7.json} (85%)
 rename data/helm_lite/AlephAlpha/luminous-extended/{24e11e7b-15d6-4a09-9545-38486d0eb236.json => fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json} (86%)
 rename data/helm_lite/AlephAlpha/luminous-supreme/{eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json => 0e2790d3-40f1-4124-ba41-b65bd9de1852.json} (85%)
 rename data/helm_lite/ai21/j2-grande/{52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json => d55129d3-4eae-4009-a897-fa1624cea6a2.json} (85%)
 rename data/helm_lite/ai21/j2-jumbo/{68713712-ae92-474b-84c0-1b8301538439.json => 6332f0b3-7fab-41ed-a8da-46b142051377.json} (85%)
 rename data/helm_lite/ai21/jamba-1.5-large/{15cc9411-6ea4-4f10-831f-23ff27fd5704.json => 0cb33741-ca10-40f5-90d3-28e300901ad3.json} (85%)
 rename data/helm_lite/ai21/jamba-1.5-mini/{3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json => 80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json} (85%)
 rename data/helm_lite/ai21/jamba-instruct/{1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json => de41775f-f60e-481e-a8ef-3df9a9b65a5a.json} (85%)
 rename data/helm_lite/allenai/olmo-7b/{078d812b-2198-4497-8fbe-06fb640fd86d.json => bc29d5c6-b5c8-473b-b69c-054026829089.json} (85%)
 rename data/helm_lite/amazon/nova-lite-v1_0/{f928a53d-9d67-45e7-a871-04359c8162d5.json => ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json} (85%)
 rename data/helm_lite/amazon/nova-micro-v1_0/{741c4560-eb35-4edf-a48b-af29e743740a.json => 4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json} (85%)
 rename data/helm_lite/amazon/nova-pro-v1_0/{4e8a8384-5f1d-4b76-be9b-385407332d6c.json => 9ef56d5a-de00-4d89-930c-a4c74211dd78.json} (85%)
 rename data/helm_lite/anthropic/claude-2.0/{0684c1d2-ea43-4341-820c-09051f5e11f2.json => 5598d3ed-5b37-4aec-b186-0b16c394633b.json} (85%)
 rename data/helm_lite/anthropic/claude-2.1/{51821ca1-7eac-4094-abac-98b2484cc5a0.json => a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json} (85%)
 rename data/helm_lite/anthropic/claude-3-5-haiku-20241022/{8a0f5749-7f6a-4813-9c08-7283433c1337.json => 54bac699-aa82-4133-8c10-c6510c2a7f95.json} (86%)
 rename data/helm_lite/anthropic/claude-3-5-sonnet-20240620/{4697983d-a29a-484d-9268-7974117456e8.json => 79b23601-3148-4256-88ce-67e439a87c5b.json} (86%)
 rename data/helm_lite/anthropic/claude-3-5-sonnet-20241022/{60e33aa3-0593-42e6-9baa-8311746deca0.json => e92648e4-75c6-4944-9ec1-880823fefc87.json} (86%)
 rename data/helm_lite/anthropic/claude-3-haiku-20240307/{2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json => 449feffd-d2e3-4a08-ad69-b8ad522532ae.json} (86%)
 rename data/helm_lite/anthropic/claude-3-opus-20240229/{9ad91ee2-7a64-4f94-9166-f2681777023b.json => d297b253-0f4f-4caf-864b-9f457ab589da.json} (86%)
 rename data/helm_lite/anthropic/claude-3-sonnet-20240229/{4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json => d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json} (86%)
 rename data/helm_lite/anthropic/claude-instant-1.2/{64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json => cb409208-034d-42fd-acce-ab5cc4227383.json} (86%)
 rename data/helm_lite/anthropic/claude-v1.3/{fe8a36b0-4361-461b-b310-656c54131fa6.json => b2572ef8-446a-45b4-b557-45736418753b.json} (85%)
 rename data/helm_lite/cohere/command-light/{b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json => 70d85516-b710-4b27-b664-03a6a822773b.json} (85%)
 rename data/helm_lite/cohere/command-r-plus/{67967a2a-5fb4-46e8-b1ec-eda1588d9086.json => a8208df4-eb37-47d2-8845-f821e80e9858.json} (85%)
 rename data/helm_lite/cohere/command-r/{0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json => 22cde248-40ab-43b0-a408-6d8b84692f22.json} (85%)
 rename data/helm_lite/cohere/command/{ba5eea81-2120-4a20-8322-dfbd29cd197c.json => b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json} (85%)
 rename data/helm_lite/databricks/dbrx-instruct/{9dd66ede-da5c-4627-92ed-7057c9a2bea3.json => ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json} (85%)
 rename data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/{801aa7da-90b2-48d1-ad3d-943b06bd437c.json => 8721a15b-9102-4b1a-bde8-e5371f00f1b5.json} (85%)
 rename data/helm_lite/deepseek-ai/deepseek-v3/{a58923ea-fa22-4c45-8327-efbe84c8a05d.json => 23b3a30c-8aa3-4684-be54-adae003720fc.json} (85%)
 rename data/helm_lite/google/gemini-1.0-pro-002/{bab8d241-fad0-4230-b213-c2eeccc79f12.json => 7022c444-d6b8-4374-be0c-14835e5fd281.json} (85%)
 rename data/helm_lite/google/gemini-1.5-flash-001/{65e37589-ef26-46cd-a627-798af70e75bf.json => bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json} (85%)
 rename data/helm_lite/google/gemini-1.5-flash-002/{f499f9c6-4c9a-43ba-b4c3-d094494a371c.json => bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json} (85%)
 rename data/helm_lite/google/gemini-1.5-pro-001/{27a54446-57b2-4239-b768-7ab85dc94c54.json => 527418d0-2591-43c9-b639-17328292b110.json} (85%)
 rename data/helm_lite/google/gemini-1.5-pro-002/{5de8a13e-a029-4a90-9a2d-c28a59212140.json => 8ddc465f-4f2d-4213-81c4-70b584d48047.json} (85%)
 rename data/helm_lite/google/gemini-2.0-flash-exp/{f9643ce2-7347-401b-903e-fadcc5221f36.json => eca63d17-7fc2-4722-8bb3-0be99a257100.json} (85%)
 rename data/helm_lite/google/gemma-2-27b-it/{9932e430-2039-40b0-bc8f-ae2d833543e8.json => e40a10b3-e682-4715-b2ee-4efcae050a58.json} (85%)
 rename data/helm_lite/google/gemma-2-9b-it/{dbd2e9bb-c2ca-4165-b229-d736a70721a5.json => 56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json} (85%)
 rename data/helm_lite/google/gemma-7b/{32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json => f47ca10d-cd45-485e-b9cf-0c6592d63656.json} (85%)
 rename data/helm_lite/google/text-bison@001/{70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json => 7f0e318e-31bf-4044-bffb-357c1238d4fd.json} (85%)
 rename data/helm_lite/google/text-unicorn@001/{07a367ee-2879-4ede-bbf8-33b24d682467.json => 818d6d72-0b5c-4fcf-b808-1d186223301e.json} (85%)
 rename data/helm_lite/meta/llama-2-13b/{fee914c7-d6bf-4d61-9f50-71bae5f11006.json => f09b853b-dbbc-4252-a0f0-a2c45c29f670.json} (85%)
 rename data/helm_lite/meta/llama-2-70b/{b0577066-231e-461b-bae8-b724b204397a.json => f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json} (85%)
 rename data/helm_lite/meta/llama-2-7b/{b79fe2e3-5eec-46f8-90a1-810781c8c46a.json => 83c6a723-87a0-43d4-968e-86d186578e9e.json} (85%)
 rename data/helm_lite/meta/llama-3-70b/{998616ef-5d1b-4c65-b6ad-23afc3630d5a.json => daaf221b-1759-4619-91fb-938e81975787.json} (85%)
 rename data/helm_lite/meta/llama-3-8b/{fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json => 6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json} (85%)
 rename data/helm_lite/meta/llama-3.1-405b-instruct-turbo/{25fde5e6-86b8-4a80-8f79-5946ef9999fc.json => 1043b815-b247-4444-bf8c-0b92b793c57f.json} (86%)
 rename data/helm_lite/meta/llama-3.1-70b-instruct-turbo/{b955825d-ae7f-48c4-9dad-5ee78879737d.json => 28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json} (86%)
 rename data/helm_lite/meta/llama-3.1-8b-instruct-turbo/{168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json => 73dedd31-7d40-4ee6-994d-00eb7d656597.json} (86%)
 rename data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/{0807e353-9787-4ca0-8f7b-50d1bed2469e.json => 18da1dfa-5366-477b-a9cf-af29c5a99b68.json} (85%)
 rename data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/{0164b885-2c27-4eba-8e6f-e69156cb0dee.json => 80057cc1-45ab-4976-878e-be963eaa83b1.json} (85%)
 rename data/helm_lite/meta/llama-3.3-70b-instruct-turbo/{08422837-51a0-45c9-9004-fc5d98dce462.json => d896249f-bbd9-4657-a5db-5968544cb5fa.json} (86%)
 rename data/helm_lite/meta/llama-65b/{39f2c7f2-56d4-4349-95ae-374d34263f48.json => 9f73f3e5-b573-45d4-8c98-82f5c496f786.json} (85%)
 rename data/helm_lite/microsoft/phi-2/{0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json => a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json} (85%)
 rename data/helm_lite/microsoft/phi-3-medium-4k-instruct/{75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json => 4ff688da-61a0-43ce-9c2d-e1c197887683.json} (86%)
 rename data/helm_lite/microsoft/phi-3-small-8k-instruct/{2de4b89a-3f3b-4d1d-ba85-030953a46956.json => 181003ea-7587-4c93-8b89-c5c76958313d.json} (85%)
 rename data/helm_lite/mistralai/mistral-7b-instruct-v0.3/{bd68405f-fe9a-448b-9c80-468c656594e5.json => 66688228-e59a-4caa-b3fb-c5df1efc9db4.json} (86%)
 rename data/helm_lite/mistralai/mistral-7b-v0.1/{4267fef1-3180-46e3-990e-0d1092ec4c18.json => 2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json} (85%)
 rename data/helm_lite/mistralai/mistral-large-2402/{002a34dc-39e5-451d-b2a8-b51bdb69a056.json => 077fe37f-b3a4-483a-93a5-034c6445fe98.json} (86%)
 rename data/helm_lite/mistralai/mistral-large-2407/{5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json => 4fbb173c-b900-4e11-87bd-1ac6a489d014.json} (86%)
 rename data/helm_lite/mistralai/mistral-medium-2312/{ad2beded-cec3-4b47-b8de-a32a3225fa66.json => e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json} (86%)
 rename data/helm_lite/mistralai/mistral-small-2402/{eb901347-fc1f-4d8f-a70a-05a83e16658d.json => 0925f9b7-08f8-485f-84bc-a153a54aa417.json} (86%)
 rename data/helm_lite/mistralai/mixtral-8x22b/{9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json => 08082277-8305-4007-97cd-88202fc0115c.json} (85%)
 rename data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/{042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json => fe554cbd-2480-40bd-b2f5-464cad700c14.json} (86%)
 rename data/helm_lite/mistralai/open-mistral-nemo-2407/{d2d48e4a-0484-4f44-8108-2e689d7ca695.json => 9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json} (86%)
 rename data/helm_lite/openai/gpt-3.5-turbo-0613/{e54ae605-a91d-47d7-a08d-67bd0ea5c606.json => d9654997-1d3e-41c3-9f16-05a36dde9b02.json} (85%)
 rename data/helm_lite/openai/gpt-4-0613/{15dccf75-871d-457b-8495-e0d03d550360.json => 73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json} (85%)
 rename data/helm_lite/openai/gpt-4-1106-preview/{18fe5d30-bf36-405a-819e-1ecabda327ea.json => 4d01d929-b5e2-42dc-89ee-20560f560db5.json} (85%)
 rename data/helm_lite/openai/gpt-4-turbo-2024-04-09/{cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json => 76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json} (86%)
 rename data/helm_lite/openai/gpt-4o-2024-05-13/{cd199905-04a4-4745-b848-4f7bde97ca17.json => 69ea0ef0-c136-4cff-9607-6ae12e0692c3.json} (85%)
 rename data/helm_lite/openai/gpt-4o-2024-08-06/{1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json => bbe708f3-fb78-49e9-876d-cae57f1231cc.json} (85%)
 rename data/helm_lite/openai/gpt-4o-mini-2024-07-18/{bfd70aff-bf45-4f55-b730-4924afc181cd.json => ab7b7951-0792-4538-8a7a-6baee8602cbb.json} (86%)
 rename data/helm_lite/openai/text-davinci-002/{b6e08679-1bd7-42a1-9eee-98252de2c7c1.json => fc94c95d-9678-4f23-b82f-190a08ece307.json} (85%)
 rename data/helm_lite/openai/text-davinci-003/{22b411d5-a314-4b17-a9c7-c1af7ca7df33.json => 3f92e2fc-9831-4c2c-b94e-af33d457fa82.json} (85%)
 rename data/helm_lite/qwen/qwen1.5-110b-chat/{f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json => 3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json} (85%)
 rename data/helm_lite/qwen/qwen1.5-14b/{fb1bb023-16f6-4914-889b-6458d7ab1277.json => 6b2891bd-2444-4286-8ccf-c91181856d29.json} (85%)
 rename data/helm_lite/qwen/qwen1.5-32b/{8b572c10-3553-4e51-a321-bdb05996914b.json => bd924bd3-e13c-48e0-b339-8c15c5072038.json} (85%)
 rename data/helm_lite/qwen/qwen1.5-72b/{6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json => b8a6f32a-9904-43bb-9add-89404093a9db.json} (85%)
 rename data/helm_lite/qwen/qwen1.5-7b/{e0efe169-d28e-418e-a78c-9b04ec29aae2.json => c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json} (85%)
 rename data/helm_lite/qwen/qwen2-72b-instruct/{05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json => 9c1fc50a-437d-458b-926c-33cabdcc4aeb.json} (85%)
 rename data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/{983696ae-d7f3-48a4-b7a0-a42487728182.json => 5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json} (86%)
 rename data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/{a969e516-adef-4839-9252-244c58ab3c67.json => 10e1abfa-83de-4960-8d4c-c5099894cb80.json} (86%)
 rename data/helm_lite/snowflake/snowflake-arctic-instruct/{f122f9de-b1ce-40ea-8731-6c00c7af0498.json => 40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json} (86%)
 rename data/helm_lite/tiiuae/falcon-40b/{5c7982c5-3513-4ff2-9857-33a0db825376.json => 2abf3bb8-a78f-4a59-807e-52da4e6426fd.json} (85%)
 rename data/helm_lite/tiiuae/falcon-7b/{4910859a-750c-4728-bf30-309e0e81690e.json => ae28615a-b7fa-4782-89e1-4b8e4804dc62.json} (85%)
 rename data/helm_lite/upstage/solar-pro-241126/{32f0532f-b504-492d-84d7-f541930edad0.json => 52bb6ab9-e80b-4bf0-a375-7706f16d311d.json} (85%)
 rename data/helm_lite/writer/palmyra-x-004/{04c187a3-4532-4523-b39d-19314d61c779.json => fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json} (85%)
 rename data/helm_lite/writer/palmyra-x-v2/{4440532c-9b49-4c9a-8bf4-f122531c54fa.json => 1158720a-9a0e-492e-a677-9b0936f4cde5.json} (85%)
 rename data/helm_lite/writer/palmyra-x-v3/{bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json => 254ded81-4051-420d-b402-2e7b80a23848.json} (85%)
 rename data/helm_mmlu/01-ai/yi-34b/{3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json => ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json} (90%)
 rename data/helm_mmlu/01-ai/yi-6b/{6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json => 7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json} (90%)
 rename data/helm_mmlu/01-ai/yi-large-preview/{3d0b3d68-a853-4989-a35e-83ac6722c2da.json => 5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json} (90%)
 rename data/helm_mmlu/ai21/jamba-1.5-large/{ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json => 0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json} (90%)
 rename data/helm_mmlu/ai21/jamba-1.5-mini/{517e8027-6edd-482b-86f3-33b6c41a9609.json => 92e0b1b9-c167-4e07-b770-2b78527eb4eb.json} (90%)
 rename data/helm_mmlu/ai21/jamba-instruct/{f7c1c125-ad0f-4847-b880-4f705f1666c6.json => 3da06ad4-0770-45f5-a6a2-9ef9500cef05.json} (90%)
 rename data/helm_mmlu/allenai/olmo-1.7-7b/{5a0ba280-8a12-4735-9d92-4ed71ba395b4.json => c1c79360-60bd-4f5d-a746-e0411b94f69b.json} (90%)
 rename data/helm_mmlu/allenai/olmo-7b/{73ccc6a6-e10d-4619-914f-26032cddf8da.json => bb904716-048c-4b41-9f64-4d17c485afe3.json} (90%)
 rename data/helm_mmlu/amazon/nova-lite-v1_0/{20c5af59-ff73-4731-9230-f92bb86e657b.json => 063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json} (90%)
 rename data/helm_mmlu/amazon/nova-micro-v1_0/{fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json => c8949c55-8987-4ed3-b74b-8b13b4381806.json} (90%)
 rename data/helm_mmlu/amazon/nova-pro-v1_0/{d30617fc-8d64-4070-b86a-c982025cfcea.json => ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json} (90%)
 rename data/helm_mmlu/anthropic/claude-2.1/{aa8cae95-cb75-4241-951c-25e2046042dd.json => bc9cedd7-5cb2-44b2-abda-470322570e14.json} (90%)
 rename data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/{c88e4a03-22ae-4338-bf5f-36070814136a.json => 305a7f25-6e22-4146-9678-6a687a701567.json} (90%)
 rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/{4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json => c6059976-85a1-40ce-b02f-67e182aa2f7d.json} (90%)
 rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/{ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json => 6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json} (90%)
 rename data/helm_mmlu/anthropic/claude-3-haiku-20240307/{097a8da1-f411-4359-8440-2ab06f4ae76c.json => f397ca7a-41c4-4926-b075-2523639f0a50.json} (90%)
 rename data/helm_mmlu/anthropic/claude-3-opus-20240229/{68130abd-1df5-4cd3-919a-2863e9f013c7.json => acdf4701-e1c2-4867-bd85-d34ae8fb0991.json} (90%)
 rename data/helm_mmlu/anthropic/claude-3-sonnet-20240229/{5d8d795a-d213-4b96-9b17-ad5fae6b3687.json => 3cd855af-9679-4fd0-bc3f-34db697c7855.json} (90%)
 rename data/helm_mmlu/anthropic/claude-instant-1.2/{7908da03-f030-4c62-a121-c04bd94ea75e.json => 78fb6814-e32f-4b15-b958-9e001637ba07.json} (90%)
 rename data/helm_mmlu/cohere/command-r-plus/{c6fdbf96-2500-4410-8fcd-268ea3e16062.json => f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json} (90%)
 rename data/helm_mmlu/cohere/command-r/{537164c3-7b88-4543-b19d-370f55a25a66.json => cefc3b25-0779-4fb3-93a5-3c7a285304af.json} (90%)
 rename data/helm_mmlu/databricks/dbrx-instruct/{0c539e26-8403-42db-acfc-7953dd80ae20.json => 7e00e082-0e79-45e0-b0ff-5458cc2aff85.json} (90%)
 rename data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/{364c7490-8bb1-4e7e-b485-fb3c2224da58.json => ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json} (90%)
 rename data/helm_mmlu/deepseek-ai/deepseek-v3/{1a9167d2-882c-4582-b4e0-ac425896a317.json => c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json} (90%)
 rename data/helm_mmlu/google/gemini-1.0-pro-001/{8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json => 7ea5b404-d98f-4282-81d8-6ca5f6629429.json} (90%)
 rename data/helm_mmlu/google/gemini-1.5-flash-001/{d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json => 7056c7e7-f68a-4764-aa48-a8368ae2e317.json} (90%)
 rename data/helm_mmlu/google/gemini-1.5-flash-002/{a94c9e13-dca7-4e02-a795-09d9274354d3.json => 5e67014d-6ca1-4e65-a85a-84d91e147d4d.json} (90%)
 rename data/helm_mmlu/google/gemini-1.5-flash-preview-0514/{75c8b20f-a4d4-4699-be79-f027bf7f0d69.json => 3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json} (90%)
 rename data/helm_mmlu/google/gemini-1.5-pro-001/{264be7b4-08b7-40b6-a5e7-f3536f361450.json => 46d5e547-507e-4c98-98a9-bad1bfad7f7b.json} (90%)
 rename data/helm_mmlu/google/gemini-1.5-pro-002/{83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json => ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json} (90%)
 rename data/helm_mmlu/google/gemini-1.5-pro-preview-0409/{8a013eb3-0f21-4a50-8a53-4ba977951130.json => 2b31b441-caa9-465c-a2d2-051c951c7be3.json} (90%)
 rename data/helm_mmlu/google/gemini-2.0-flash-exp/{7b081a40-7cb6-4405-b842-3db95f290dfa.json => b7ea6c93-af70-4c0f-ba50-03a539416a8b.json} (90%)
 rename data/helm_mmlu/google/gemma-2-27b/{54185b53-9891-43c6-8f93-09ff02b728d8.json => fe4cec30-e483-49a8-80ea-00b2c6231740.json} (90%)
 rename data/helm_mmlu/google/gemma-2-9b/{884c194d-6519-4bd4-8add-6514e593c514.json => 53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json} (90%)
 rename data/helm_mmlu/google/gemma-7b/{a80cbd76-bcf8-4174-b0b3-346fae152bdb.json => af88b02d-cb29-4d2c-bb33-5fddcf316a95.json} (90%)
 rename data/helm_mmlu/google/text-bison@001/{5f105986-aa7d-4858-91bc-cece9d0085ba.json => a0abcd19-58a1-478a-9786-d044a4181241.json} (90%)
 rename data/helm_mmlu/google/text-unicorn@001/{528b7b4e-c8a6-4387-bd98-497a3316029d.json => 95eda13a-cd34-4170-b2db-f2ead47250f9.json} (90%)
 rename data/helm_mmlu/meta/llama-2-13b/{96eb34db-66bd-4945-8b4c-a8c1394fe56a.json => 7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json} (90%)
 rename data/helm_mmlu/meta/llama-2-70b/{961e917b-0e67-462c-b9d0-0fe4b4b85beb.json => 9da7439c-e96b-444f-b4fa-7ef638080740.json} (90%)
 rename data/helm_mmlu/meta/llama-2-7b/{59a85d2c-16ce-4ed4-bc65-f6898127fa57.json => 294b22a0-1676-4d8c-8ad2-5cdc40267255.json} (90%)
 rename data/helm_mmlu/meta/llama-3-70b/{16a8b446-51fc-4c23-9231-46ee16c1c0a8.json => 1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json} (90%)
 rename data/helm_mmlu/meta/llama-3-8b/{f4de7e58-7060-440b-8f6f-1f79d7499d1e.json => 78f2484e-bc73-4026-929b-db345e92cf5a.json} (90%)
 rename data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/{5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json => 8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json} (90%)
 rename data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/{dc6aa933-67e4-4811-b3e2-e5200c002abe.json => 41af381a-3637-4578-a582-59d9b1327d95.json} (90%)
 rename data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/{5f9758a3-fd6d-4598-930a-9c01420d05e8.json => 96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json} (90%)
 rename data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/{7592c0d8-a06c-4189-81a1-dbf794d22c8b.json => bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json} (90%)
 rename data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/{83c0e8e3-087c-4d61-9153-e571b4971871.json => e036de72-b425-4aa5-9448-dc52560e60db.json} (90%)
 rename data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/{c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json => 65423181-18f1-4296-98c2-171356106404.json} (90%)
 rename data/helm_mmlu/microsoft/phi-2/{5baac093-babb-41cd-a2f4-985d0b91be37.json => 41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json} (90%)
 rename data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/{1bf54088-ba12-45b4-8f80-63d5c38f58f6.json => f78d6e0a-a397-4a41-a37e-696bda5a1987.json} (90%)
 rename data/helm_mmlu/microsoft/phi-3-small-8k-instruct/{5ed0a970-200f-4f23-9623-e714afa49ddf.json => d2bf70ce-341f-49d7-bd03-87b523826953.json} (90%)
 rename data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/{e7fd06a6-65e5-4f88-8e86-c513f78e31db.json => b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json} (90%)
 rename data/helm_mmlu/mistralai/mistral-7b-v0.1/{ac047aef-008f-4c87-a6d5-4f331ebf5c53.json => 08590b6e-7050-413d-844b-1f3f1c5aa444.json} (90%)
 rename data/helm_mmlu/mistralai/mistral-large-2402/{ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json => 2d18fd88-73b5-4d4c-a1cc-e66a20316605.json} (90%)
 rename data/helm_mmlu/mistralai/mistral-large-2407/{7517b6c9-c613-416c-aadb-39fd6d252da7.json => 567918be-be6f-4e41-b613-727828fe8a44.json} (90%)
 rename data/helm_mmlu/mistralai/mistral-small-2402/{85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json => c2be131b-808c-4947-b24f-69ef6af499d7.json} (90%)
 rename data/helm_mmlu/mistralai/mixtral-8x22b/{df568c3c-8a5c-4455-836d-c980d7f5ea5c.json => 24955250-a2e9-475f-a866-30a835579e03.json} (90%)
 rename data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/{96e24977-ca6d-402c-bfd8-62be4cd9b902.json => de6f7e19-b54a-4bd3-b624-29f66afbee15.json} (90%)
 rename data/helm_mmlu/mistralai/open-mistral-nemo-2407/{e5b2636a-8438-40c0-9f89-9f35585bf740.json => e4c3032d-04e0-414b-a7e9-e30756d82000.json} (90%)
 rename data/helm_mmlu/openai/gpt-3.5-turbo-0125/{f3259d92-3c95-4b78-81ae-f7f4b80aec63.json => e9a41d4b-56c7-47f0-b439-72ad1e463000.json} (90%)
 rename data/helm_mmlu/openai/gpt-3.5-turbo-0613/{5ba23a34-4232-487f-b3e9-326d776135be.json => a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json} (90%)
 rename data/helm_mmlu/openai/gpt-4-0613/{5bc1a462-f753-4259-91c3-a549491b2986.json => fd6aea24-dc18-41ce-bc19-23f461a39032.json} (90%)
 rename data/helm_mmlu/openai/gpt-4-1106-preview/{16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json => 625d33ce-a320-4bfd-a962-451b8c22d392.json} (90%)
 rename data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/{dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json => e51be257-610e-4d38-b58a-a3b29fc06a83.json} (90%)
 rename data/helm_mmlu/openai/gpt-4o-2024-05-13/{2ca11d4c-52e6-49ea-a5cb-238c0313c483.json => 9e0b9f48-f913-4bbe-a135-59e596c9e479.json} (90%)
 rename data/helm_mmlu/openai/gpt-4o-2024-08-06/{de400624-6c2e-47af-b851-54c4075c30ee.json => 189e6cc5-1c8f-4712-8dda-c108f18f836d.json} (90%)
 rename data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/{34441b3b-4d66-444c-af85-ca0666a48ed4.json => 4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json} (90%)
 rename data/helm_mmlu/qwen/qwen1.5-110b-chat/{eecf5e40-9110-47ea-a72b-9ba587b96e30.json => ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json} (90%)
 rename data/helm_mmlu/qwen/qwen1.5-14b/{f26fb123-c214-4d18-aea8-b05b4ea1819b.json => fa6a6772-671b-402e-9480-d61e0fb4a61e.json} (90%)
 rename data/helm_mmlu/qwen/qwen1.5-32b/{30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json => b5279e94-ae7f-4671-9315-874e162a24fd.json} (90%)
 rename data/helm_mmlu/qwen/qwen1.5-72b/{b152cd5c-cbc0-48f4-ba37-16878c3afba1.json => de00e8da-9c83-40df-b642-b94719ce1ac2.json} (90%)
 rename data/helm_mmlu/qwen/qwen1.5-7b/{dac223e9-3073-46f9-924b-c5a6408f5da9.json => 119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json} (90%)
 rename data/helm_mmlu/qwen/qwen2-72b-instruct/{a7a218ff-7afe-417c-ac39-cf305d592d56.json => 80aabdf4-60b7-493b-98d8-1854f1c41c10.json} (90%)
 rename data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/{2e165735-43b8-4317-9cde-35aa4b5bcb26.json => 29958cee-32c9-4d51-8f14-72db4273459f.json} (90%)
 rename data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/{15c25bc5-7b1e-4771-bda2-fd04d74e1463.json => 72537b16-feda-4e5e-a477-f415650db847.json} (90%)
 rename data/helm_mmlu/snowflake/snowflake-arctic-instruct/{26036c7c-e981-46e8-b5e9-dcd7d116af70.json => 7df68af5-667a-4125-9c12-e71fb5af0a74.json} (90%)
 rename data/helm_mmlu/upstage/solar-pro-241126/{b3269e4e-98a7-4795-8ef3-fc87774a54b7.json => 1845eb8b-4c94-4d22-8771-012f7230dc62.json} (90%)
 rename data/helm_mmlu/writer/palmyra-x-004/{284fde9f-8570-4e6d-9190-e52d8723fe57.json => b2c8cfd1-f09a-4616-8038-c7e1930bce74.json} (90%)
 rename data/helm_mmlu/writer/palmyra-x-v3/{fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json => 12976629-cefe-4329-b974-bb17f88d385d.json} (90%)

diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json
similarity index 92%
rename from data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json
rename to data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json
index 28c2132cc..8176fa91a 100644
--- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json
+++ b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -171,7 +171,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -180,7 +180,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -230,7 +230,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -239,7 +239,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -290,7 +290,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -299,7 +299,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json
similarity index 92%
rename from data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json
rename to data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json
index c2c0ac804..4d2b264af 100644
--- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json
+++ b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -171,7 +171,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -180,7 +180,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -230,7 +230,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -239,7 +239,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -290,7 +290,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -299,7 +299,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json
similarity index 92%
rename from data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json
rename to data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json
index cbc2ce18e..39fbc0d1c 100644
--- a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json
+++ b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -171,7 +171,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -180,7 +180,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -230,7 +230,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -239,7 +239,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -290,7 +290,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -299,7 +299,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json
similarity index 92%
rename from data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json
rename to data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json
index 4bae095b1..99d31c069 100644
--- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json
+++ b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -171,7 +171,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -180,7 +180,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -230,7 +230,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -239,7 +239,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -290,7 +290,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -299,7 +299,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json
similarity index 92%
rename from data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json
rename to data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json
index f34e2fca2..c786f36c7 100644
--- a/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json
+++ b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json
similarity index 92%
rename from data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json
rename to data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json
index da4fca4b9..6219cdf47 100644
--- a/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json
+++ b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json
similarity index 92%
rename from data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json
rename to data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json
index 7d306af4a..d9f1bd857 100644
--- a/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json
+++ b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json
similarity index 92%
rename from data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json
rename to data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json
index 9634c0423..658945ff5 100644
--- a/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json
+++ b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json
rename to data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json
index 59583f434..d63e271d1 100644
--- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json
+++ b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json
rename to data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json
index 050628b1e..c53a3aa66 100644
--- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json
+++ b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json
rename to data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json
index 325dd380e..1f5c52f66 100644
--- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json
+++ b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json
rename to data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json
index 82dc8fad1..da15e55a7 100644
--- a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json
+++ b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json
rename to data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json
index 0e6c52fbd..c554c6a65 100644
--- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json
+++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json
rename to data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json
index 7abaf15ac..240e9ebf4 100644
--- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json
+++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json
rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json
index f65747fef..ecc6c0f0a 100644
--- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json
+++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json
rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json
index 98193fa4e..b4413ccdd 100644
--- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json
+++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json
similarity index 92%
rename from data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json
rename to data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json
index 3583acbb0..e0991c0d9 100644
--- a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json
+++ b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json
similarity index 92%
rename from data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json
rename to data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json
index 6cc5a7f14..682cc94cc 100644
--- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json
+++ b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json
similarity index 92%
rename from data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json
rename to data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json
index 46c4843d4..3b034de70 100644
--- a/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json
+++ b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json
rename to data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json
index 26e2e73d6..7d4281de4 100644
--- a/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json
+++ b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json
rename to data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json
index 1157dc164..3c438fd59 100644
--- a/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json
+++ b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json
rename to data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json
index 68450c9bd..7f589b967 100644
--- a/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json
+++ b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json
similarity index 91%
rename from data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json
rename to data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json
index 1bc6a5842..0376cdf40 100644
--- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json
+++ b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json
rename to data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json
index f9f820a96..600681fbb 100644
--- a/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json
+++ b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json
rename to data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json
index 7f7987a29..221dc7a91 100644
--- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json
+++ b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json
rename to data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json
index c845227fa..355cd3bc1 100644
--- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json
+++ b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json
similarity index 92%
rename from data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json
rename to data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json
index e4e82cd5b..d3ecb3ebb 100644
--- a/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json
+++ b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json
similarity index 92%
rename from data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json
rename to data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json
index 828363b5a..869902b9d 100644
--- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json
+++ b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json
similarity index 92%
rename from data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json
rename to data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json
index 8203eb4c6..03bc0f0f8 100644
--- a/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json
+++ b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json
similarity index 92%
rename from data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json
rename to data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json
index bfe399026..399dbb1e3 100644
--- a/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json
+++ b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json
similarity index 92%
rename from data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json
rename to data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json
index 215be80f3..736686c13 100644
--- a/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json
+++ b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -171,7 +171,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -180,7 +180,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -230,7 +230,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -239,7 +239,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -290,7 +290,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -299,7 +299,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json
similarity index 92%
rename from data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json
rename to data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json
index 41fd4d1af..4dd5465a5 100644
--- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json
+++ b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json
similarity index 92%
rename from data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json
rename to data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json
index 7e6e617b7..407242cbb 100644
--- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json
+++ b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json
similarity index 92%
rename from data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json
rename to data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json
index 0c2bb79e7..30524d64b 100644
--- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json
+++ b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json
similarity index 92%
rename from data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json
rename to data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json
index 71c8e88c3..d9ca75120 100644
--- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json
+++ b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json
similarity index 92%
rename from data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json
rename to data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json
index 35aef174b..640472423 100644
--- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json
+++ b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json
similarity index 92%
rename from data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json
rename to data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json
index ee064ad73..0b19a4ab4 100644
--- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json
+++ b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json
similarity index 92%
rename from data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json
rename to data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json
index f4fd3ec06..dec52ca8a 100644
--- a/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json
+++ b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json
similarity index 92%
rename from data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json
rename to data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json
index ff90f0105..7999b823d 100644
--- a/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json
+++ b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json
similarity index 92%
rename from data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json
rename to data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json
index 703963331..583f7956f 100644
--- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json
+++ b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json
similarity index 92%
rename from data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json
rename to data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json
index c522fd879..d2c9cfb4e 100644
--- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json
+++ b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json
similarity index 92%
rename from data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json
rename to data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json
index b69be21a9..1946db617 100644
--- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json
+++ b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json
rename to data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json
index 17443bc6f..3c36cb01b 100644
--- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json
+++ b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json
rename to data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json
index 0342d7835..dd4503511 100644
--- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json
+++ b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json
rename to data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json
index 15a7d0356..e2550958a 100644
--- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json
+++ b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json
rename to data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json
index ed5380bd3..3c3d40256 100644
--- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json
+++ b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json
rename to data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json
index e38c0ac88..778449e6e 100644
--- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json
+++ b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json
rename to data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json
index fb85b633b..95d9762ef 100644
--- a/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json
+++ b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json
rename to data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json
index 3ca436502..5dc165206 100644
--- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json
+++ b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json
rename to data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json
index e271e8724..096518c62 100644
--- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json
+++ b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json
rename to data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json
index 492db1047..738007852 100644
--- a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json
+++ b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json
similarity index 92%
rename from data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json
rename to data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json
index 13795ec21..8642e9954 100644
--- a/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json
+++ b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json
similarity index 91%
rename from data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json
rename to data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json
index d2f755b28..5112d535f 100644
--- a/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json
+++ b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json
similarity index 92%
rename from data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json
rename to data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json
index 7455567bf..677721448 100644
--- a/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json
+++ b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json
similarity index 92%
rename from data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json
rename to data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json
index c33228ef1..fd4ae16c5 100644
--- a/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json
+++ b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json
similarity index 92%
rename from data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json
rename to data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json
index 31467bc1e..50778c699 100644
--- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json
+++ b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json
similarity index 92%
rename from data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json
rename to data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json
index 0ac7225b8..c974f1019 100644
--- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json
+++ b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json
similarity index 92%
rename from data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json
rename to data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json
index 1d36e4190..9ded60c84 100644
--- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json
+++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json
similarity index 92%
rename from data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json
rename to data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json
index 04fc2f6cc..0210712c3 100644
--- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json
+++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json
similarity index 92%
rename from data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json
rename to data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json
index bbcecd669..6ee69548e 100644
--- a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json
+++ b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json
similarity index 91%
rename from data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json
rename to data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json
index da11997be..b86fc5b45 100644
--- a/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json
+++ b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json
similarity index 91%
rename from data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json
rename to data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json
index 78088b82e..ac68f722a 100644
--- a/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json
+++ b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-med/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/writer_palmyra-med/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json
similarity index 92%
rename from data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json
rename to data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json
index b630b3cd0..9398b6319 100644
--- a/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json
+++ b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json
similarity index 91%
rename from data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json
rename to data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json
index c212295f1..6d3707107 100644
--- a/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json
+++ b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json
similarity index 91%
rename from data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json
rename to data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json
index 34f4be43b..54503d043 100644
--- a/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json
+++ b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json
similarity index 92%
rename from data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json
rename to data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json
index 31dab978e..a083c0183 100644
--- a/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json
+++ b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json
similarity index 91%
rename from data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json
rename to data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json
index 872c4f1f5..a25562cb1 100644
--- a/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json
+++ b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json
similarity index 92%
rename from data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json
rename to data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json
index 4ead0f554..43a98dd63 100644
--- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json
+++ b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770830201.581632",
-  "retrieved_timestamp": "1770830201.581632",
+  "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770835969.095764",
+  "retrieved_timestamp": "1770835969.095764",
   "source_metadata": {
     "source_name": "helm_capabilities",
     "source_type": "documentation",
@@ -47,7 +47,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU-Pro - COT correct",
+      "evaluation_name": "MMLU-Pro",
       "source_data": {
         "dataset_name": "MMLU-Pro",
         "source_type": "url",
@@ -56,7 +56,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on MMLU-Pro",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -108,7 +108,7 @@
       }
     },
     {
-      "evaluation_name": "GPQA - COT correct",
+      "evaluation_name": "GPQA",
       "source_data": {
         "dataset_name": "GPQA",
         "source_type": "url",
@@ -117,7 +117,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought",
+        "evaluation_description": "COT correct on GPQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -169,7 +169,7 @@
       }
     },
     {
-      "evaluation_name": "IFEval - IFEval Strict Acc",
+      "evaluation_name": "IFEval",
       "source_data": {
         "dataset_name": "IFEval",
         "source_type": "url",
@@ -178,7 +178,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.",
+        "evaluation_description": "IFEval Strict Acc on IFEval",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -226,7 +226,7 @@
       }
     },
     {
-      "evaluation_name": "WildBench - WB Score",
+      "evaluation_name": "WildBench",
       "source_data": {
         "dataset_name": "WildBench",
         "source_type": "url",
@@ -235,7 +235,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+        "evaluation_description": "WB Score on WildBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -285,7 +285,7 @@
       }
     },
     {
-      "evaluation_name": "Omni-MATH - Acc",
+      "evaluation_name": "Omni-MATH",
       "source_data": {
         "dataset_name": "Omni-MATH",
         "source_type": "url",
@@ -294,7 +294,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.",
+        "evaluation_description": "Acc on Omni-MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json
similarity index 91%
rename from data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json
rename to data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json
index 58e2410df..152223193 100644
--- a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json
+++ b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json
rename to data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json
index 4805e7ac8..6a9a41b41 100644
--- a/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json
+++ b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json
rename to data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json
index e47585440..30c92ab94 100644
--- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json
+++ b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json
rename to data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json
index bfd78fa42..df8111bcc 100644
--- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json
+++ b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json
similarity index 92%
rename from data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json
rename to data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json
index a1c2d2860..5c8560533 100644
--- a/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json
+++ b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json
similarity index 92%
rename from data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json
rename to data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json
index e9db23ac9..4f288f894 100644
--- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json
+++ b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json
similarity index 92%
rename from data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json
rename to data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json
index 38cd07e2a..6d0308b9f 100644
--- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json
+++ b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json
similarity index 91%
rename from data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json
rename to data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json
index 589346e15..4278cef81 100644
--- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json
+++ b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json
similarity index 91%
rename from data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json
rename to data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json
index 371a206a5..7e02805f7 100644
--- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json
+++ b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json
similarity index 91%
rename from data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json
rename to data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json
index 715673aae..d6f8fa8ea 100644
--- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json
+++ b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json
similarity index 91%
rename from data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json
rename to data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json
index 5f8731441..5680298fb 100644
--- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json
+++ b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json
similarity index 91%
rename from data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json
rename to data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json
index 04305416d..caffd542e 100644
--- a/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json
+++ b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json
similarity index 93%
rename from data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json
rename to data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json
index 1bbeba7ff..400f064d5 100644
--- a/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json
+++ b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json
rename to data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json
index fadfb62da..25f29c7e2 100644
--- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json
+++ b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json
rename to data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json
index b1c061a45..8f01acff1 100644
--- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json
+++ b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json
rename to data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json
index bd838c107..16c06b937 100644
--- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json
+++ b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json
rename to data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json
index 3e7a0f6fa..f0d42b850 100644
--- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json
+++ b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json
rename to data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json
index 745f99da6..43f986e70 100644
--- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json
+++ b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json
rename to data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json
index 478f77b1d..adaaa9403 100644
--- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json
+++ b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json
rename to data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json
index 2039d0727..80b637746 100644
--- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json
+++ b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json
similarity index 92%
rename from data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json
rename to data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json
index 216532187..cc49de0c7 100644
--- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json
+++ b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json
similarity index 91%
rename from data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json
rename to data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json
index 8f5d16956..bc304945b 100644
--- a/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json
+++ b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json
similarity index 91%
rename from data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json
rename to data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json
index 20ca16498..511816a71 100644
--- a/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json
+++ b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json
similarity index 91%
rename from data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json
rename to data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json
index d36f642d7..8d33e45b6 100644
--- a/data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json
+++ b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json
similarity index 91%
rename from data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json
rename to data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json
index 0f7601506..2a710defd 100644
--- a/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json
+++ b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/google_T5-11B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/google_T5-11B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json
similarity index 91%
rename from data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json
rename to data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json
index 70193f3b0..bb571aece 100644
--- a/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json
+++ b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/google_UL2-20B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/google_UL2-20B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json
similarity index 91%
rename from data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json
rename to data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json
index 385ac9b25..e1d9662a3 100644
--- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json
+++ b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json
similarity index 91%
rename from data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json
rename to data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json
index 3de9b1fd2..b03d7afe6 100644
--- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json
+++ b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json
rename to data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json
index ac2da41ef..959b52195 100644
--- a/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json
+++ b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-13B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_LLaMA-13B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json
rename to data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json
index 1b33fd761..7f604e015 100644
--- a/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json
+++ b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-30B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_LLaMA-30B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json
rename to data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json
index a7d6351b2..ad8c1c451 100644
--- a/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json
+++ b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-65B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_LLaMA-65B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json
similarity index 91%
rename from data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json
rename to data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json
index 79b00a818..152b9e683 100644
--- a/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json
+++ b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_LLaMA-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_LLaMA-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json
similarity index 91%
rename from data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json
rename to data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json
index 170095f5b..f2cd54e60 100644
--- a/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json
+++ b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_Llama-2-13B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_Llama-2-13B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json
similarity index 91%
rename from data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json
rename to data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json
index 01e4b1b14..de031e670 100644
--- a/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json
+++ b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_Llama-2-70B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_Llama-2-70B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json
similarity index 91%
rename from data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json
rename to data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json
index 40b71e7de..eac315fea 100644
--- a/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json
+++ b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_Llama-2-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_Llama-2-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json
similarity index 92%
rename from data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json
rename to data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json
index f864b9222..63a0c348d 100644
--- a/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json
+++ b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_OPT-175B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_OPT-175B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json
similarity index 92%
rename from data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json
rename to data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json
index ba62ce7d1..2f3d2ad96 100644
--- a/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json
+++ b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/meta_OPT-66B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/meta_OPT-66B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json
similarity index 91%
rename from data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json
rename to data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json
index ce5dcad88..ddcfa82ef 100644
--- a/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json
+++ b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json
similarity index 91%
rename from data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json
rename to data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json
index cfa4e8177..b3f527a04 100644
--- a/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json
+++ b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json
similarity index 91%
rename from data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json
rename to data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json
index 738857e58..1fd56a99f 100644
--- a/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json
+++ b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json
similarity index 91%
rename from data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json
rename to data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json
index 2580877d4..b0d1817b0 100644
--- a/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json
+++ b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json
similarity index 91%
rename from data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json
rename to data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json
index a7cbf9856..771c4ac02 100644
--- a/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json
+++ b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json
similarity index 92%
rename from data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json
rename to data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json
index c135cdcfb..20a0f0d63 100644
--- a/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json
+++ b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_GPT-J-6B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_GPT-J-6B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json
similarity index 92%
rename from data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json
rename to data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json
index d4e4c3e18..0c00ea05c 100644
--- a/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json
+++ b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json
similarity index 94%
rename from data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json
rename to data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json
index ae351a8ab..5355ce78b 100644
--- a/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json
+++ b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_ada-350M/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_ada-350M/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json
similarity index 94%
rename from data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json
rename to data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json
index 4f76e1f1b..d3977fc36 100644
--- a/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json
+++ b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_babbage-1.3B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_babbage-1.3B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json
similarity index 94%
rename from data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json
rename to data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json
index 84c344282..fe011ca06 100644
--- a/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json
+++ b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_curie-6.7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_curie-6.7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json
similarity index 94%
rename from data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json
rename to data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json
index fb0de7bd8..b376d2873 100644
--- a/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json
+++ b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_davinci-175B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_davinci-175B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json
similarity index 91%
rename from data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json
rename to data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json
index 582bc2e6a..8051b9b3e 100644
--- a/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json
+++ b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json
similarity index 91%
rename from data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json
rename to data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json
index 5a9810e18..b2682e6f7 100644
--- a/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json
+++ b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json
similarity index 94%
rename from data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json
rename to data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json
index 0e0d9602b..43f728bf2 100644
--- a/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json
+++ b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_text-ada-001/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_text-ada-001/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json
similarity index 94%
rename from data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json
rename to data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json
index 734c00775..fbb4b5bb6 100644
--- a/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json
+++ b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_text-babbage-001/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_text-babbage-001/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json
similarity index 94%
rename from data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json
rename to data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json
index ba874427c..4537bcc84 100644
--- a/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json
+++ b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_text-curie-001/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_text-curie-001/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json
similarity index 94%
rename from data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json
rename to data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json
index 4555e0f80..0e9fa4947 100644
--- a/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json
+++ b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_text-davinci-002/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_text-davinci-002/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json
similarity index 94%
rename from data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json
rename to data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json
index 7fd229e00..9ca831c0f 100644
--- a/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json
+++ b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/openai_text-davinci-003/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/openai_text-davinci-003/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json
similarity index 91%
rename from data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json
rename to data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json
index f68731052..cf2a4b297 100644
--- a/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json
+++ b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json
similarity index 91%
rename from data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json
rename to data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json
index 85693f897..97f13c6d9 100644
--- a/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json
+++ b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json
similarity index 91%
rename from data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json
rename to data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json
index e165123de..80c0ac18a 100644
--- a/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json
+++ b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json
similarity index 91%
rename from data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json
rename to data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json
index 3c1369c88..4b7c6b681 100644
--- a/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json
+++ b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json
similarity index 91%
rename from data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json
rename to data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json
index 19076bf3f..cd7efa818 100644
--- a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json
+++ b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json
rename to data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json
index 90ced7618..f25c83f2e 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json
rename to data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json
index 858c06ee0..d4d85552c 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json
rename to data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json
index e246416bd..9d60f7506 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json
similarity index 91%
rename from data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json
rename to data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json
index 828ab9683..57ffafd39 100644
--- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json
+++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json
similarity index 91%
rename from data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json
rename to data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json
index 0d4ab9c94..fe1ab40e2 100644
--- a/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json
+++ b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json
similarity index 91%
rename from data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json
rename to data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json
index 4b439ab57..61a019ad2 100644
--- a/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json
+++ b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/yandex_YaLM-100B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/yandex_YaLM-100B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json
similarity index 91%
rename from data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json
rename to data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json
index 67e0f75ce..04bdfa490 100644
--- a/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json
+++ b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770830385.7573261",
-  "retrieved_timestamp": "1770830385.7573261",
+  "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770834891.1472661",
+  "retrieved_timestamp": "1770834891.1472661",
   "source_metadata": {
     "source_name": "helm_classic",
     "source_type": "documentation",
@@ -82,7 +82,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -91,7 +91,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -159,7 +159,7 @@
       }
     },
     {
-      "evaluation_name": "BoolQ - EM",
+      "evaluation_name": "BoolQ",
       "source_data": {
         "dataset_name": "BoolQ",
         "source_type": "url",
@@ -168,7 +168,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on BoolQ",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -363,7 +363,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (open-book) - F1",
+      "evaluation_name": "NaturalQuestions (open-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (open-book)",
         "source_type": "url",
@@ -372,7 +372,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (open-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -540,7 +540,7 @@
       }
     },
     {
-      "evaluation_name": "QuAC - F1",
+      "evaluation_name": "QuAC",
       "source_data": {
         "dataset_name": "QuAC",
         "source_type": "url",
@@ -549,7 +549,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on QuAC",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -642,7 +642,7 @@
       }
     },
     {
-      "evaluation_name": "HellaSwag - EM",
+      "evaluation_name": "HellaSwag",
       "source_data": {
         "dataset_name": "HellaSwag",
         "source_type": "url",
@@ -651,7 +651,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on HellaSwag",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -719,7 +719,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -728,7 +728,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -796,7 +796,7 @@
       }
     },
     {
-      "evaluation_name": "TruthfulQA - EM",
+      "evaluation_name": "TruthfulQA",
       "source_data": {
         "dataset_name": "TruthfulQA",
         "source_type": "url",
@@ -805,7 +805,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on TruthfulQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -873,7 +873,7 @@
       }
     },
     {
-      "evaluation_name": "MS MARCO (TREC) - NDCG@10",
+      "evaluation_name": "MS MARCO (TREC)",
       "source_data": {
         "dataset_name": "MS MARCO (TREC)",
         "source_type": "url",
@@ -882,7 +882,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.",
+        "evaluation_description": "NDCG@10 on MS MARCO (TREC)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1040,7 +1040,7 @@
       }
     },
     {
-      "evaluation_name": "CNN/DailyMail - ROUGE-2",
+      "evaluation_name": "CNN/DailyMail",
       "source_data": {
         "dataset_name": "CNN/DailyMail",
         "source_type": "url",
@@ -1049,7 +1049,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on CNN/DailyMail",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1172,7 +1172,7 @@
       }
     },
     {
-      "evaluation_name": "XSUM - ROUGE-2",
+      "evaluation_name": "XSUM",
       "source_data": {
         "dataset_name": "XSUM",
         "source_type": "url",
@@ -1181,7 +1181,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.",
+        "evaluation_description": "ROUGE-2 on XSUM",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1304,7 +1304,7 @@
       }
     },
     {
-      "evaluation_name": "IMDB - EM",
+      "evaluation_name": "IMDB",
       "source_data": {
         "dataset_name": "IMDB",
         "source_type": "url",
@@ -1313,7 +1313,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on IMDB",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1406,7 +1406,7 @@
       }
     },
     {
-      "evaluation_name": "CivilComments - EM",
+      "evaluation_name": "CivilComments",
       "source_data": {
         "dataset_name": "CivilComments",
         "source_type": "url",
@@ -1415,7 +1415,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on CivilComments",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1508,7 +1508,7 @@
       }
     },
     {
-      "evaluation_name": "RAFT - EM",
+      "evaluation_name": "RAFT",
       "source_data": {
         "dataset_name": "RAFT",
         "source_type": "url",
@@ -1517,7 +1517,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on RAFT",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json
similarity index 73%
rename from data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json
rename to data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json
index abd3e5e5b..31ab229b7 100644
--- a/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json
+++ b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770830411.78817",
-  "retrieved_timestamp": "1770830411.78817",
+  "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770834858.3559701",
+  "retrieved_timestamp": "1770834858.3559701",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -42,7 +42,7 @@
       }
     },
     {
-      "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "evaluation_name": "Anthropic RLHF dataset",
       "source_data": {
         "dataset_name": "Anthropic RLHF dataset",
         "source_type": "url",
@@ -51,7 +51,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Anthropic RLHF dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -90,7 +90,7 @@
       }
     },
     {
-      "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "evaluation_name": "Best ChatGPT Prompts",
       "source_data": {
         "dataset_name": "Best ChatGPT Prompts",
         "source_type": "url",
@@ -99,7 +99,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Best ChatGPT Prompts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -126,7 +126,7 @@
       }
     },
     {
-      "evaluation_name": "Koala test dataset - Harmlessness",
+      "evaluation_name": "Koala test dataset",
       "source_data": {
         "dataset_name": "Koala test dataset",
         "source_type": "url",
@@ -135,7 +135,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Koala test dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -160,7 +160,7 @@
       }
     },
     {
-      "evaluation_name": "Open Assistant - Harmlessness",
+      "evaluation_name": "Open Assistant",
       "source_data": {
         "dataset_name": "Open Assistant",
         "source_type": "url",
@@ -169,7 +169,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Open Assistant",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -195,7 +195,7 @@
       }
     },
     {
-      "evaluation_name": "Self Instruct - Harmlessness",
+      "evaluation_name": "Self Instruct",
       "source_data": {
         "dataset_name": "Self Instruct",
         "source_type": "url",
@@ -204,7 +204,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Self Instruct",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -229,7 +229,7 @@
       }
     },
     {
-      "evaluation_name": "Vicuna - Harmlessness",
+      "evaluation_name": "Vicuna",
       "source_data": {
         "dataset_name": "Vicuna",
         "source_type": "url",
@@ -238,7 +238,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Vicuna",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json
similarity index 74%
rename from data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json
rename to data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json
index 3aea06a21..2fd221159 100644
--- a/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json
+++ b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770830411.78817",
-  "retrieved_timestamp": "1770830411.78817",
+  "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770834858.3559701",
+  "retrieved_timestamp": "1770834858.3559701",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -42,7 +42,7 @@
       }
     },
     {
-      "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "evaluation_name": "Anthropic RLHF dataset",
       "source_data": {
         "dataset_name": "Anthropic RLHF dataset",
         "source_type": "url",
@@ -51,7 +51,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Anthropic RLHF dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -90,7 +90,7 @@
       }
     },
     {
-      "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "evaluation_name": "Best ChatGPT Prompts",
       "source_data": {
         "dataset_name": "Best ChatGPT Prompts",
         "source_type": "url",
@@ -99,7 +99,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Best ChatGPT Prompts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -126,7 +126,7 @@
       }
     },
     {
-      "evaluation_name": "Koala test dataset - Harmlessness",
+      "evaluation_name": "Koala test dataset",
       "source_data": {
         "dataset_name": "Koala test dataset",
         "source_type": "url",
@@ -135,7 +135,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Koala test dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -160,7 +160,7 @@
       }
     },
     {
-      "evaluation_name": "Open Assistant - Harmlessness",
+      "evaluation_name": "Open Assistant",
       "source_data": {
         "dataset_name": "Open Assistant",
         "source_type": "url",
@@ -169,7 +169,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Open Assistant",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -195,7 +195,7 @@
       }
     },
     {
-      "evaluation_name": "Self Instruct - Harmlessness",
+      "evaluation_name": "Self Instruct",
       "source_data": {
         "dataset_name": "Self Instruct",
         "source_type": "url",
@@ -204,7 +204,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Self Instruct",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -229,7 +229,7 @@
       }
     },
     {
-      "evaluation_name": "Vicuna - Harmlessness",
+      "evaluation_name": "Vicuna",
       "source_data": {
         "dataset_name": "Vicuna",
         "source_type": "url",
@@ -238,7 +238,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Vicuna",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json
similarity index 74%
rename from data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json
rename to data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json
index 31fd0891a..23dfc4397 100644
--- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json
+++ b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770830411.78817",
-  "retrieved_timestamp": "1770830411.78817",
+  "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770834858.3559701",
+  "retrieved_timestamp": "1770834858.3559701",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -42,7 +42,7 @@
       }
     },
     {
-      "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "evaluation_name": "Anthropic RLHF dataset",
       "source_data": {
         "dataset_name": "Anthropic RLHF dataset",
         "source_type": "url",
@@ -51,7 +51,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Anthropic RLHF dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -90,7 +90,7 @@
       }
     },
     {
-      "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "evaluation_name": "Best ChatGPT Prompts",
       "source_data": {
         "dataset_name": "Best ChatGPT Prompts",
         "source_type": "url",
@@ -99,7 +99,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Best ChatGPT Prompts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -126,7 +126,7 @@
       }
     },
     {
-      "evaluation_name": "Koala test dataset - Harmlessness",
+      "evaluation_name": "Koala test dataset",
       "source_data": {
         "dataset_name": "Koala test dataset",
         "source_type": "url",
@@ -135,7 +135,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Koala test dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -160,7 +160,7 @@
       }
     },
     {
-      "evaluation_name": "Open Assistant - Harmlessness",
+      "evaluation_name": "Open Assistant",
       "source_data": {
         "dataset_name": "Open Assistant",
         "source_type": "url",
@@ -169,7 +169,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Open Assistant",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -195,7 +195,7 @@
       }
     },
     {
-      "evaluation_name": "Self Instruct - Harmlessness",
+      "evaluation_name": "Self Instruct",
       "source_data": {
         "dataset_name": "Self Instruct",
         "source_type": "url",
@@ -204,7 +204,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Self Instruct",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -229,7 +229,7 @@
       }
     },
     {
-      "evaluation_name": "Vicuna - Harmlessness",
+      "evaluation_name": "Vicuna",
       "source_data": {
         "dataset_name": "Vicuna",
         "source_type": "url",
@@ -238,7 +238,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Vicuna",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json
similarity index 73%
rename from data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json
rename to data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json
index ac8e25cb0..9ad1bca2e 100644
--- a/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json
+++ b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770830411.78817",
-  "retrieved_timestamp": "1770830411.78817",
+  "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770834858.3559701",
+  "retrieved_timestamp": "1770834858.3559701",
   "source_metadata": {
     "source_name": "helm_instruct",
     "source_type": "documentation",
@@ -42,7 +42,7 @@
       }
     },
     {
-      "evaluation_name": "Anthropic RLHF dataset - Harmlessness",
+      "evaluation_name": "Anthropic RLHF dataset",
       "source_data": {
         "dataset_name": "Anthropic RLHF dataset",
         "source_type": "url",
@@ -51,7 +51,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Anthropic RLHF dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -90,7 +90,7 @@
       }
     },
     {
-      "evaluation_name": "Best ChatGPT Prompts - Harmlessness",
+      "evaluation_name": "Best ChatGPT Prompts",
       "source_data": {
         "dataset_name": "Best ChatGPT Prompts",
         "source_type": "url",
@@ -99,7 +99,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Best ChatGPT Prompts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -126,7 +126,7 @@
       }
     },
     {
-      "evaluation_name": "Koala test dataset - Harmlessness",
+      "evaluation_name": "Koala test dataset",
       "source_data": {
         "dataset_name": "Koala test dataset",
         "source_type": "url",
@@ -135,7 +135,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Koala test dataset",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -160,7 +160,7 @@
       }
     },
     {
-      "evaluation_name": "Open Assistant - Harmlessness",
+      "evaluation_name": "Open Assistant",
       "source_data": {
         "dataset_name": "Open Assistant",
         "source_type": "url",
@@ -169,7 +169,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Open Assistant",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -195,7 +195,7 @@
       }
     },
     {
-      "evaluation_name": "Self Instruct - Harmlessness",
+      "evaluation_name": "Self Instruct",
       "source_data": {
         "dataset_name": "Self Instruct",
         "source_type": "url",
@@ -204,7 +204,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Self Instruct",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -229,7 +229,7 @@
       }
     },
     {
-      "evaluation_name": "Vicuna - Harmlessness",
+      "evaluation_name": "Vicuna",
       "source_data": {
         "dataset_name": "Vicuna",
         "source_type": "url",
@@ -238,7 +238,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.",
+        "evaluation_description": "Harmlessness on Vicuna",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json
similarity index 85%
rename from data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json
rename to data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json
index 96c3d4d2d..946b7db3e 100644
--- a/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json
+++ b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/01-ai_yi-34b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/01-ai_yi-34b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json
similarity index 85%
rename from data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json
rename to data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json
index 497d98a06..28ba5fb69 100644
--- a/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json
+++ b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/01-ai_yi-6b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/01-ai_yi-6b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json
similarity index 85%
rename from data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json
rename to data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json
index 7bea38ffb..9fe678bb4 100644
--- a/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json
+++ b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json
similarity index 85%
rename from data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json
rename to data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json
index 00a6f037c..fb405652b 100644
--- a/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json
+++ b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json
similarity index 86%
rename from data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json
rename to data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json
index 215983cef..786a7e340 100644
--- a/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json
+++ b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json
similarity index 85%
rename from data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json
rename to data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json
index 81f487c09..78da47969 100644
--- a/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json
+++ b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json
similarity index 85%
rename from data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json
rename to data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json
index ef3567598..2b870e958 100644
--- a/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json
+++ b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/ai21_j2-grande/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/ai21_j2-grande/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json
similarity index 85%
rename from data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json
rename to data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json
index f39f9c93e..643b24001 100644
--- a/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json
+++ b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/ai21_j2-jumbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/ai21_j2-jumbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json
similarity index 85%
rename from data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json
rename to data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json
index d7dee0e9a..a07da123a 100644
--- a/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json
+++ b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json
similarity index 85%
rename from data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json
rename to data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json
index f65e65120..9e0628c9d 100644
--- a/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json
+++ b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json
similarity index 85%
rename from data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json
rename to data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json
index a3e5bda34..9e1241a8e 100644
--- a/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json
+++ b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/ai21_jamba-instruct/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/ai21_jamba-instruct/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json
similarity index 85%
rename from data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json
rename to data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json
index 51375c00c..b68794dd1 100644
--- a/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json
+++ b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/allenai_olmo-7b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/allenai_olmo-7b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json
similarity index 85%
rename from data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json
rename to data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json
index 289dc9306..084734ba7 100644
--- a/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json
+++ b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json
similarity index 85%
rename from data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json
rename to data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json
index bcd94c63d..fb66c7744 100644
--- a/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json
+++ b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json
similarity index 85%
rename from data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json
rename to data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json
index c8589b186..c7f9d86e2 100644
--- a/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json
+++ b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json
similarity index 85%
rename from data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json
rename to data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json
index 2f9d0f3e2..ab0989b58 100644
--- a/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json
+++ b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-2.0/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-2.0/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json
similarity index 85%
rename from data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json
rename to data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json
index fe9f851b2..2adbb62af 100644
--- a/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json
+++ b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-2.1/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-2.1/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json
rename to data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json
index 9eecf8a25..ff757a7ad 100644
--- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json
+++ b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json
rename to data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json
index f3aab2968..2c4b0d7d1 100644
--- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json
+++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json
rename to data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json
index 6a814b17d..4b9824f13 100644
--- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json
+++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json
rename to data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json
index 54328bd79..8eac62865 100644
--- a/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json
+++ b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json
rename to data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json
index ad60ccaa3..d590c786e 100644
--- a/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json
+++ b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json
rename to data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json
index 35374c2f9..90baddbf7 100644
--- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json
+++ b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json
similarity index 86%
rename from data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json
rename to data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json
index 7dbf7e9ee..c3ca60cb8 100644
--- a/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json
+++ b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json
similarity index 85%
rename from data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json
rename to data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json
index 04da077b3..da3e6b3b3 100644
--- a/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json
+++ b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json
similarity index 85%
rename from data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json
rename to data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json
index b4ccf63fd..a431f3338 100644
--- a/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json
+++ b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/cohere_command-light/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/cohere_command-light/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json
similarity index 85%
rename from data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json
rename to data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json
index e941df44c..d0f464767 100644
--- a/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json
+++ b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/cohere_command-r-plus/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/cohere_command-r-plus/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json
similarity index 85%
rename from data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json
rename to data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json
index 2314d1d0d..51821d155 100644
--- a/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json
+++ b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/cohere_command-r/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/cohere_command-r/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json
similarity index 85%
rename from data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json
rename to data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json
index 95909d3aa..488fa54b9 100644
--- a/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json
+++ b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/cohere_command/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/cohere_command/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json
similarity index 85%
rename from data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json
rename to data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json
index 81dc83db8..9dc0aa32d 100644
--- a/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json
+++ b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json
similarity index 85%
rename from data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json
rename to data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json
index 31cee265a..201ddf6e5 100644
--- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json
+++ b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json
similarity index 85%
rename from data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json
rename to data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json
index cc64f30ee..b5f8e240f 100644
--- a/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json
+++ b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json
similarity index 85%
rename from data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json
rename to data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json
index f6af740ee..eabdc0bbd 100644
--- a/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json
+++ b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json
similarity index 85%
rename from data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json
rename to data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json
index 7c312bb83..991b81669 100644
--- a/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json
+++ b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json
similarity index 85%
rename from data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json
rename to data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json
index 450dbafcb..725c639a2 100644
--- a/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json
+++ b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json
similarity index 85%
rename from data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json
rename to data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json
index 653e006ee..8b7eab026 100644
--- a/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json
+++ b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json
similarity index 85%
rename from data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json
rename to data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json
index 64f712478..ebd3081fb 100644
--- a/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json
+++ b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json
similarity index 85%
rename from data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json
rename to data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json
index 371c57f27..b96b71c0c 100644
--- a/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json
+++ b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json
similarity index 85%
rename from data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json
rename to data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json
index 24f598da3..ea107cc9e 100644
--- a/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json
+++ b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json
similarity index 85%
rename from data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json
rename to data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json
index 1e65ff610..1488d6604 100644
--- a/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json
+++ b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json
similarity index 85%
rename from data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json
rename to data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json
index ee614ce44..810e32965 100644
--- a/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json
+++ b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_gemma-7b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_gemma-7b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json
similarity index 85%
rename from data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json
rename to data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json
index 7d7c944f0..30d0e3442 100644
--- a/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json
+++ b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_text-bison@001/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_text-bison@001/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json
similarity index 85%
rename from data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json
rename to data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json
index f19d99b14..d5841340f 100644
--- a/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json
+++ b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/google_text-unicorn@001/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/google_text-unicorn@001/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json
similarity index 85%
rename from data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json
rename to data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json
index f38e87995..079c14180 100644
--- a/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json
+++ b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-2-13b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-2-13b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json
similarity index 85%
rename from data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json
rename to data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json
index b0d616c29..8faa07285 100644
--- a/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json
+++ b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-2-70b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-2-70b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json
similarity index 85%
rename from data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json
rename to data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json
index 1e9aac924..bb2c02730 100644
--- a/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json
+++ b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-2-7b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-2-7b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json
similarity index 85%
rename from data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json
rename to data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json
index f13ee8122..876850010 100644
--- a/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json
+++ b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3-70b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3-70b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json
similarity index 85%
rename from data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json
rename to data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json
index 7f18bf5fd..87ab72524 100644
--- a/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json
+++ b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3-8b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3-8b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json
similarity index 86%
rename from data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json
rename to data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json
index 6ef4300e5..0bc6225d5 100644
--- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json
+++ b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json
similarity index 86%
rename from data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json
rename to data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json
index 8afc05c39..d57074cb2 100644
--- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json
+++ b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json
similarity index 86%
rename from data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json
rename to data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json
index e5cc6d55a..198d81cd2 100644
--- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json
+++ b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json
similarity index 85%
rename from data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json
rename to data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json
index 793304d91..722a6f050 100644
--- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json
+++ b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json
similarity index 85%
rename from data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json
rename to data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json
index 615526ba3..8bef7c4e9 100644
--- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json
+++ b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json
similarity index 86%
rename from data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json
rename to data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json
index 8116db8ba..cc4cca983 100644
--- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json
+++ b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json
similarity index 85%
rename from data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json
rename to data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json
index f3354af37..ebea32b6c 100644
--- a/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json
+++ b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/meta_llama-65b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/meta_llama-65b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json
similarity index 85%
rename from data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json
rename to data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json
index 172c44cac..ee330c2d2 100644
--- a/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json
+++ b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/microsoft_phi-2/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/microsoft_phi-2/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json
similarity index 86%
rename from data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json
rename to data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json
index c613f7fec..6d945026f 100644
--- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json
+++ b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json
similarity index 85%
rename from data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json
rename to data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json
index f78b3f049..c7b88764b 100644
--- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json
+++ b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json
similarity index 86%
rename from data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json
rename to data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json
index 97f8b3a1e..fd0f8e02b 100644
--- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json
+++ b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json
similarity index 85%
rename from data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json
rename to data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json
index 30337d5a4..8f4801f23 100644
--- a/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json
+++ b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json
similarity index 86%
rename from data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json
rename to data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json
index edea4050d..d8d60cc37 100644
--- a/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json
+++ b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json
similarity index 86%
rename from data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json
rename to data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json
index d2dd06c67..d75c9932b 100644
--- a/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json
+++ b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json
similarity index 86%
rename from data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json
rename to data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json
index cbbf76044..6bb7115e2 100644
--- a/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json
+++ b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json
similarity index 86%
rename from data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json
rename to data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json
index d1c6bf6d0..1f2cb2632 100644
--- a/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json
+++ b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json
similarity index 85%
rename from data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json
rename to data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json
index d020ec2ef..e6bfd0332 100644
--- a/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json
+++ b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json
similarity index 86%
rename from data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json
rename to data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json
index ca92e5358..7bf0323b1 100644
--- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json
+++ b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json
similarity index 86%
rename from data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json
rename to data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json
index 75b65c3cd..7fee5cb57 100644
--- a/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json
+++ b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json
similarity index 85%
rename from data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json
rename to data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json
index c3db66d6f..878d33981 100644
--- a/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json
+++ b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json
similarity index 85%
rename from data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json
rename to data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json
index 3b34bbe84..7ff111f74 100644
--- a/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json
+++ b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-4-0613/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-4-0613/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json
similarity index 85%
rename from data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json
rename to data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json
index f80298de5..060ab8fb5 100644
--- a/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json
+++ b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json
similarity index 86%
rename from data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json
rename to data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json
index 49bdd419a..dae83b652 100644
--- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json
+++ b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json
similarity index 85%
rename from data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json
rename to data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json
index ab2f778b6..c23053f17 100644
--- a/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json
+++ b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json
similarity index 85%
rename from data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json
rename to data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json
index 3d286d830..f8d7c3614 100644
--- a/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json
+++ b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json
similarity index 86%
rename from data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json
rename to data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json
index 53ecaa7dc..3869cb246 100644
--- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json
+++ b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json
similarity index 85%
rename from data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json
rename to data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json
index c90d2c5a2..f3294dd85 100644
--- a/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json
+++ b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_text-davinci-002/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_text-davinci-002/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json
similarity index 85%
rename from data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json
rename to data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json
index 6f2c648e1..93f27df2b 100644
--- a/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json
+++ b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/openai_text-davinci-003/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/openai_text-davinci-003/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json
similarity index 85%
rename from data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json
rename to data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json
index 3b85e6b08..800f57826 100644
--- a/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json
+++ b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json
similarity index 85%
rename from data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json
rename to data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json
index 4df79c00f..c8749e5f5 100644
--- a/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json
+++ b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json
similarity index 85%
rename from data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json
rename to data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json
index 74a01181c..699c1515b 100644
--- a/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json
+++ b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json
similarity index 85%
rename from data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json
rename to data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json
index a056d0e42..8b347b68d 100644
--- a/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json
+++ b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json
similarity index 85%
rename from data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json
rename to data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json
index 0757d65b1..b1bc89d92 100644
--- a/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json
+++ b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json
similarity index 85%
rename from data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json
rename to data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json
index 2d8d0469d..58edcde03 100644
--- a/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json
+++ b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json
similarity index 86%
rename from data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json
rename to data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json
index 6091d879d..3e08a0cdf 100644
--- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json
+++ b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json
similarity index 86%
rename from data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json
rename to data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json
index a9b9ae2a3..3f844c281 100644
--- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json
+++ b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -519,7 +519,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -528,7 +528,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -576,7 +576,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -585,7 +585,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json
similarity index 86%
rename from data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json
rename to data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json
index f7f93c913..09f377d89 100644
--- a/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json
+++ b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json
similarity index 85%
rename from data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json
rename to data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json
index 65a14de91..2bf240f96 100644
--- a/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json
+++ b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json
similarity index 85%
rename from data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json
rename to data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json
index 62d1fae1c..9a704269c 100644
--- a/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json
+++ b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json
similarity index 85%
rename from data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json
rename to data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json
index 9e56dbbb6..1f111d01c 100644
--- a/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json
+++ b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -453,7 +453,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -462,7 +462,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -518,7 +518,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -527,7 +527,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -575,7 +575,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -584,7 +584,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json
similarity index 85%
rename from data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json
rename to data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json
index 2b000451d..8026be475 100644
--- a/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json
+++ b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/writer_palmyra-x-004/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/writer_palmyra-x-004/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -111,7 +111,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -120,7 +120,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -201,7 +201,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -210,7 +210,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -261,7 +261,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -270,7 +270,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -327,7 +327,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -336,7 +336,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -398,7 +398,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -407,7 +407,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -457,7 +457,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -466,7 +466,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -523,7 +523,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -532,7 +532,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -580,7 +580,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -589,7 +589,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json
similarity index 85%
rename from data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json
rename to data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json
index fc600d1dc..5e5faf9fb 100644
--- a/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json
+++ b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json
similarity index 85%
rename from data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json
rename to data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json
index 3ac2641c0..c8073d254 100644
--- a/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json
+++ b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770829788.2883599",
-  "retrieved_timestamp": "1770829788.2883599",
+  "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770834614.1822479",
+  "retrieved_timestamp": "1770834614.1822479",
   "source_metadata": {
     "source_name": "helm_lite",
     "source_type": "documentation",
@@ -52,7 +52,7 @@
       }
     },
     {
-      "evaluation_name": "NarrativeQA - F1",
+      "evaluation_name": "NarrativeQA",
       "source_data": {
         "dataset_name": "NarrativeQA",
         "source_type": "url",
@@ -61,7 +61,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NarrativeQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -109,7 +109,7 @@
       }
     },
     {
-      "evaluation_name": "NaturalQuestions (closed-book) - F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
       "source_data": {
         "dataset_name": "NaturalQuestions (closed-book)",
         "source_type": "url",
@@ -118,7 +118,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
+        "evaluation_description": "F1 on NaturalQuestions (closed-book)",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -198,7 +198,7 @@
       }
     },
     {
-      "evaluation_name": "OpenbookQA - EM",
+      "evaluation_name": "OpenbookQA",
       "source_data": {
         "dataset_name": "OpenbookQA",
         "source_type": "url",
@@ -207,7 +207,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on OpenbookQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -258,7 +258,7 @@
       }
     },
     {
-      "evaluation_name": "MMLU - EM",
+      "evaluation_name": "MMLU",
       "source_data": {
         "dataset_name": "MMLU",
         "source_type": "url",
@@ -267,7 +267,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -324,7 +324,7 @@
       }
     },
     {
-      "evaluation_name": "MATH - Equivalent (CoT)",
+      "evaluation_name": "MATH",
       "source_data": {
         "dataset_name": "MATH",
         "source_type": "url",
@@ -333,7 +333,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
+        "evaluation_description": "Equivalent (CoT) on MATH",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -394,7 +394,7 @@
       }
     },
     {
-      "evaluation_name": "GSM8K - EM",
+      "evaluation_name": "GSM8K",
       "source_data": {
         "dataset_name": "GSM8K",
         "source_type": "url",
@@ -403,7 +403,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
+        "evaluation_description": "EM on GSM8K",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -451,7 +451,7 @@
       }
     },
     {
-      "evaluation_name": "LegalBench - EM",
+      "evaluation_name": "LegalBench",
       "source_data": {
         "dataset_name": "LegalBench",
         "source_type": "url",
@@ -460,7 +460,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on LegalBench",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_name": "MedQA - EM",
+      "evaluation_name": "MedQA",
       "source_data": {
         "dataset_name": "MedQA",
         "source_type": "url",
@@ -525,7 +525,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
+        "evaluation_description": "EM on MedQA",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -573,7 +573,7 @@
       }
     },
     {
-      "evaluation_name": "WMT 2014 - BLEU-4",
+      "evaluation_name": "WMT 2014",
       "source_data": {
         "dataset_name": "WMT 2014",
         "source_type": "url",
@@ -582,7 +582,7 @@
         ]
       },
       "metric_config": {
-        "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
+        "evaluation_description": "BLEU-4 on WMT 2014",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json
similarity index 90%
rename from data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json
rename to data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json
index cdb3ca461..a5d4de71f 100644
--- a/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json
+++ b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json
similarity index 90%
rename from data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json
rename to data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json
index 1b8b7e56f..1f0a7e20f 100644
--- a/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json
+++ b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json
similarity index 90%
rename from data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json
rename to data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json
index 29bc15bb3..4838cda1c 100644
--- a/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json
+++ b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
similarity index 90%
rename from data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json
rename to data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
index 2bf971f25..45536e1a1 100644
--- a/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json
+++ b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json
similarity index 90%
rename from data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json
rename to data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json
index 7ffc27970..727c60261 100644
--- a/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json
+++ b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json
similarity index 90%
rename from data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json
rename to data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json
index 92ba45d60..3a25316d9 100644
--- a/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json
+++ b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json
similarity index 90%
rename from data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json
rename to data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json
index e53150712..8bf036c64 100644
--- a/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json
+++ b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json
similarity index 90%
rename from data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json
rename to data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json
index 301523f0f..2b8d4cdfb 100644
--- a/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json
+++ b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json
similarity index 90%
rename from data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json
rename to data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json
index d80215b78..1bb99dccc 100644
--- a/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json
+++ b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json
similarity index 90%
rename from data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json
rename to data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json
index f28fc4ccf..ab9b8c843 100644
--- a/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json
+++ b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json
similarity index 90%
rename from data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json
rename to data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json
index 66455ef1d..af30c4448 100644
--- a/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json
+++ b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json
rename to data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json
index 163a9d31a..c2616d7f8 100644
--- a/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json
+++ b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json
rename to data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json
index edabc3b81..76628bf51 100644
--- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json
+++ b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json
rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json
index 8d402d4fb..9d9557efc 100644
--- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json
+++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json
rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json
index a435d5c4d..35be68aa6 100644
--- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json
+++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json
rename to data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
index 66b68fa6b..969900aba 100644
--- a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json
+++ b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json
rename to data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json
index 140c0db28..230be4291 100644
--- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json
+++ b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json
rename to data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json
index a6eb131df..dd7543ecb 100644
--- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json
+++ b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
similarity index 90%
rename from data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json
rename to data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
index 38a7ffacb..c9e9779b1 100644
--- a/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json
+++ b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json
similarity index 90%
rename from data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json
rename to data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json
index 4b85be9b2..6bebd236d 100644
--- a/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json
+++ b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json
similarity index 90%
rename from data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json
rename to data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json
index 90cbd571c..e82639d82 100644
--- a/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json
+++ b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/cohere_command-r/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/cohere_command-r/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json
similarity index 90%
rename from data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json
rename to data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json
index 753506525..d5f73b61f 100644
--- a/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json
+++ b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
similarity index 90%
rename from data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json
rename to data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
index f12e3799a..7ec071041 100644
--- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json
+++ b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json
similarity index 90%
rename from data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json
rename to data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json
index 86fd9dec9..200a6e19c 100644
--- a/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json
+++ b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json
rename to data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json
index 0184241c6..86096274a 100644
--- a/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json
+++ b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json
rename to data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json
index 7baa6457e..7aac2d734 100644
--- a/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json
+++ b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json
rename to data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json
index f095d6361..a87c94c3b 100644
--- a/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json
+++ b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json
rename to data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json
index fe99bd4e4..b8d59d877 100644
--- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json
+++ b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json
rename to data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json
index 4b9fc2846..0632aee68 100644
--- a/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json
+++ b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json
rename to data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json
index 47f80252d..d6a3ba87a 100644
--- a/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json
+++ b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json
rename to data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
index 901c1dd01..de3a77c03 100644
--- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json
+++ b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json
similarity index 90%
rename from data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json
rename to data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json
index 0eda6b6b1..6b53de064 100644
--- a/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json
+++ b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json
similarity index 90%
rename from data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json
rename to data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json
index 142296fc4..8720cc062 100644
--- a/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json
+++ b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json
similarity index 90%
rename from data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json
rename to data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json
index 6f84fd47f..2007b06df 100644
--- a/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json
+++ b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json
similarity index 90%
rename from data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json
rename to data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json
index ac525859f..963d13c9a 100644
--- a/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json
+++ b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemma-7b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_gemma-7b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json
similarity index 90%
rename from data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json
rename to data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json
index b20dbe54d..c0271bcb3 100644
--- a/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json
+++ b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_text-bison@001/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_text-bison@001/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json
similarity index 90%
rename from data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json
rename to data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json
index 7b3536f41..42c5040aa 100644
--- a/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json
+++ b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json
rename to data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json
index a786ac0dd..453cd8b3a 100644
--- a/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json
+++ b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json
rename to data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json
index bd988b6d8..aa6a9caa2 100644
--- a/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json
+++ b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json
rename to data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json
index b29cd7460..0649e7329 100644
--- a/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json
+++ b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json
rename to data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json
index d46d7f50a..4f09a5ee3 100644
--- a/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json
+++ b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json
rename to data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json
index 31dfddc02..83f907e80 100644
--- a/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json
+++ b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json
rename to data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
index 64eb43090..c4ce37e9d 100644
--- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json
+++ b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json
rename to data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json
index 149eb0100..0e4b849f9 100644
--- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json
+++ b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json
rename to data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json
index 46bd04117..6c1d661d4 100644
--- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json
+++ b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json
rename to data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json
index 187d1c6a7..599cd6855 100644
--- a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json
+++ b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json
rename to data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json
index 9625c1e16..f14700c78 100644
--- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json
+++ b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json
similarity index 90%
rename from data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json
rename to data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json
index 8effae129..faf8ae128 100644
--- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json
+++ b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json
similarity index 90%
rename from data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json
rename to data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json
index 07027b7fb..95bd9f1b8 100644
--- a/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json
+++ b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/microsoft_phi-2/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/microsoft_phi-2/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json
similarity index 90%
rename from data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json
rename to data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json
index 6ce22179c..f1d62a268 100644
--- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json
+++ b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json
similarity index 90%
rename from data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json
rename to data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json
index 7278b002a..bbe3afca0 100644
--- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json
+++ b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json
rename to data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json
index 886ff1732..e788149e1 100644
--- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json
+++ b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json
rename to data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json
index 935804d7f..5ca508d3b 100644
--- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json
+++ b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json
rename to data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
index bc72ce600..6b7873124 100644
--- a/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json
+++ b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json
rename to data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json
index 272dc142d..58aa6a379 100644
--- a/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json
+++ b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json
rename to data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json
index 607dc1e03..457d9ed2a 100644
--- a/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json
+++ b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json
rename to data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json
index 401d4b7c8..c7ab33c35 100644
--- a/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json
+++ b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
similarity index 90%
rename from data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json
rename to data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
index b88295eb7..3ed7c6104 100644
--- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json
+++ b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json
similarity index 90%
rename from data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json
rename to data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json
index 5a436d9c9..e5aec6b67 100644
--- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json
+++ b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json
rename to data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json
index 5923a61b0..e429d6dbc 100644
--- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json
+++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json
rename to data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json
index c62c20e9c..92faf2169 100644
--- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json
+++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json
rename to data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json
index 9877671a2..6ccc418f3 100644
--- a/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json
+++ b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json
rename to data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json
index 448f5bbca..610be9719 100644
--- a/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json
+++ b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json
rename to data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json
index aefe21734..a348a9fb9 100644
--- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json
+++ b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json
rename to data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json
index efc7bbe5a..76ba53d53 100644
--- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json
+++ b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json
rename to data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json
index fe9568710..2d538eb02 100644
--- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json
+++ b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json
similarity index 90%
rename from data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json
rename to data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json
index 681eae3b7..7753003a8 100644
--- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json
+++ b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json
rename to data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json
index 6667a05bb..4b924f5af 100644
--- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json
+++ b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json
rename to data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json
index ce5d472c6..9bfc87f91 100644
--- a/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json
+++ b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json
rename to data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json
index ff8059b60..d1a9f19e1 100644
--- a/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json
+++ b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json
rename to data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json
index c69a6d09c..94c5e4e80 100644
--- a/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json
+++ b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json
rename to data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json
index 8651674c9..166da7894 100644
--- a/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json
+++ b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json
rename to data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json
index 89026d1dc..6f8b955e0 100644
--- a/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json
+++ b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json
rename to data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json
index 1a03b982a..a61d620fd 100644
--- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json
+++ b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json
similarity index 90%
rename from data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json
rename to data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json
index 032da16a1..c045e519d 100644
--- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json
+++ b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json
similarity index 90%
rename from data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json
rename to data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json
index 5482f32f0..0afa77758 100644
--- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json
+++ b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json
similarity index 90%
rename from data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json
rename to data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json
index b71ad83e6..2c0cfc48a 100644
--- a/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json
+++ b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json
similarity index 90%
rename from data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json
rename to data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json
index 734ce34f3..c204b253d 100644
--- a/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json
+++ b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json
similarity index 90%
rename from data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json
rename to data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json
index 0e5669e0a..2eef769c8 100644
--- a/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json
+++ b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json
@@ -1,7 +1,7 @@
 {
   "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770830564.5477738",
-  "retrieved_timestamp": "1770830564.5477738",
+  "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770835937.459157",
+  "retrieved_timestamp": "1770835937.459157",
   "source_metadata": {
     "source_name": "helm_mmlu",
     "source_type": "documentation",
@@ -16,16 +16,16 @@
   },
   "evaluation_results": [
     {
-      "evaluation_name": "MMLU All Subjects - EM",
+      "evaluation_name": "MMLU All Subjects",
       "source_data": {
-        "dataset_name": "MMLU All Subjects",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on MMLU All Subjects",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -194,16 +194,16 @@
       }
     },
     {
-      "evaluation_name": "Abstract Algebra - EM",
+      "evaluation_name": "Abstract Algebra",
       "source_data": {
-        "dataset_name": "Abstract Algebra",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Abstract Algebra",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -256,16 +256,16 @@
       }
     },
     {
-      "evaluation_name": "Anatomy - EM",
+      "evaluation_name": "Anatomy",
       "source_data": {
-        "dataset_name": "Anatomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Anatomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -318,16 +318,16 @@
       }
     },
     {
-      "evaluation_name": "College Physics - EM",
+      "evaluation_name": "College Physics",
       "source_data": {
-        "dataset_name": "College Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on College Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -530,16 +530,16 @@
       }
     },
     {
-      "evaluation_name": "Computer Security - EM",
+      "evaluation_name": "Computer Security",
       "source_data": {
-        "dataset_name": "Computer Security",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Computer Security",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -592,16 +592,16 @@
       }
     },
     {
-      "evaluation_name": "Econometrics - EM",
+      "evaluation_name": "Econometrics",
       "source_data": {
-        "dataset_name": "Econometrics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Econometrics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -654,16 +654,16 @@
       }
     },
     {
-      "evaluation_name": "Global Facts - EM",
+      "evaluation_name": "Global Facts",
       "source_data": {
-        "dataset_name": "Global Facts",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Global Facts",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -716,16 +716,16 @@
       }
     },
     {
-      "evaluation_name": "Jurisprudence - EM",
+      "evaluation_name": "Jurisprudence",
       "source_data": {
-        "dataset_name": "Jurisprudence",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Jurisprudence",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -778,16 +778,16 @@
       }
     },
     {
-      "evaluation_name": "Philosophy - EM",
+      "evaluation_name": "Philosophy",
       "source_data": {
-        "dataset_name": "Philosophy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Philosophy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -840,16 +840,16 @@
       }
     },
     {
-      "evaluation_name": "Professional Psychology - EM",
+      "evaluation_name": "Professional Psychology",
       "source_data": {
-        "dataset_name": "Professional Psychology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Professional Psychology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -992,16 +992,16 @@
       }
     },
     {
-      "evaluation_name": "Us Foreign Policy - EM",
+      "evaluation_name": "Us Foreign Policy",
       "source_data": {
-        "dataset_name": "Us Foreign Policy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Us Foreign Policy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1054,16 +1054,16 @@
       }
     },
     {
-      "evaluation_name": "Astronomy - EM",
+      "evaluation_name": "Astronomy",
       "source_data": {
-        "dataset_name": "Astronomy",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Astronomy",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1116,16 +1116,16 @@
       }
     },
     {
-      "evaluation_name": "Business Ethics - EM",
+      "evaluation_name": "Business Ethics",
       "source_data": {
-        "dataset_name": "Business Ethics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Business Ethics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1178,16 +1178,16 @@
       }
     },
     {
-      "evaluation_name": "Clinical Knowledge - EM",
+      "evaluation_name": "Clinical Knowledge",
       "source_data": {
-        "dataset_name": "Clinical Knowledge",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Clinical Knowledge",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1240,16 +1240,16 @@
       }
     },
     {
-      "evaluation_name": "Conceptual Physics - EM",
+      "evaluation_name": "Conceptual Physics",
       "source_data": {
-        "dataset_name": "Conceptual Physics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Conceptual Physics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1302,16 +1302,16 @@
       }
     },
     {
-      "evaluation_name": "Electrical Engineering - EM",
+      "evaluation_name": "Electrical Engineering",
       "source_data": {
-        "dataset_name": "Electrical Engineering",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Electrical Engineering",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1364,16 +1364,16 @@
       }
     },
     {
-      "evaluation_name": "Elementary Mathematics - EM",
+      "evaluation_name": "Elementary Mathematics",
       "source_data": {
-        "dataset_name": "Elementary Mathematics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Elementary Mathematics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1426,16 +1426,16 @@
       }
     },
     {
-      "evaluation_name": "Formal Logic - EM",
+      "evaluation_name": "Formal Logic",
       "source_data": {
-        "dataset_name": "Formal Logic",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Formal Logic",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1488,16 +1488,16 @@
       }
     },
     {
-      "evaluation_name": "High School World History - EM",
+      "evaluation_name": "High School World History",
       "source_data": {
-        "dataset_name": "High School World History",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on High School World History",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -1940,16 +1940,16 @@
       }
     },
     {
-      "evaluation_name": "Human Sexuality - EM",
+      "evaluation_name": "Human Sexuality",
       "source_data": {
-        "dataset_name": "Human Sexuality",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Human Sexuality",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2032,16 +2032,16 @@
       }
     },
     {
-      "evaluation_name": "International Law - EM",
+      "evaluation_name": "International Law",
       "source_data": {
-        "dataset_name": "International Law",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on International Law",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2094,16 +2094,16 @@
       }
     },
     {
-      "evaluation_name": "Logical Fallacies - EM",
+      "evaluation_name": "Logical Fallacies",
       "source_data": {
-        "dataset_name": "Logical Fallacies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Logical Fallacies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2156,16 +2156,16 @@
       }
     },
     {
-      "evaluation_name": "Machine Learning - EM",
+      "evaluation_name": "Machine Learning",
       "source_data": {
-        "dataset_name": "Machine Learning",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Machine Learning",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2218,16 +2218,16 @@
       }
     },
     {
-      "evaluation_name": "Management - EM",
+      "evaluation_name": "Management",
       "source_data": {
-        "dataset_name": "Management",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Management",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2280,16 +2280,16 @@
       }
     },
     {
-      "evaluation_name": "Marketing - EM",
+      "evaluation_name": "Marketing",
       "source_data": {
-        "dataset_name": "Marketing",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Marketing",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2342,16 +2342,16 @@
       }
     },
     {
-      "evaluation_name": "Medical Genetics - EM",
+      "evaluation_name": "Medical Genetics",
       "source_data": {
-        "dataset_name": "Medical Genetics",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Medical Genetics",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2404,16 +2404,16 @@
       }
     },
     {
-      "evaluation_name": "Miscellaneous - EM",
+      "evaluation_name": "Miscellaneous",
       "source_data": {
-        "dataset_name": "Miscellaneous",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Miscellaneous",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2466,16 +2466,16 @@
       }
     },
     {
-      "evaluation_name": "Moral Scenarios - EM",
+      "evaluation_name": "Moral Scenarios",
       "source_data": {
-        "dataset_name": "Moral Scenarios",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Moral Scenarios",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2558,16 +2558,16 @@
       }
     },
     {
-      "evaluation_name": "Nutrition - EM",
+      "evaluation_name": "Nutrition",
       "source_data": {
-        "dataset_name": "Nutrition",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Nutrition",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2620,16 +2620,16 @@
       }
     },
     {
-      "evaluation_name": "Prehistory - EM",
+      "evaluation_name": "Prehistory",
       "source_data": {
-        "dataset_name": "Prehistory",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Prehistory",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2682,16 +2682,16 @@
       }
     },
     {
-      "evaluation_name": "Public Relations - EM",
+      "evaluation_name": "Public Relations",
       "source_data": {
-        "dataset_name": "Public Relations",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Public Relations",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2744,16 +2744,16 @@
       }
     },
     {
-      "evaluation_name": "Security Studies - EM",
+      "evaluation_name": "Security Studies",
       "source_data": {
-        "dataset_name": "Security Studies",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Security Studies",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2806,16 +2806,16 @@
       }
     },
     {
-      "evaluation_name": "Sociology - EM",
+      "evaluation_name": "Sociology",
       "source_data": {
-        "dataset_name": "Sociology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Sociology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2868,16 +2868,16 @@
       }
     },
     {
-      "evaluation_name": "Virology - EM",
+      "evaluation_name": "Virology",
       "source_data": {
-        "dataset_name": "Virology",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on Virology",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
@@ -2930,16 +2930,16 @@
       }
     },
     {
-      "evaluation_name": "World Religions - EM",
+      "evaluation_name": "World Religions",
       "source_data": {
-        "dataset_name": "World Religions",
+        "dataset_name": "helm_mmlu",
         "source_type": "url",
         "url": [
           "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
         ]
       },
       "metric_config": {
-        "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
+        "evaluation_description": "EM on World Religions",
         "lower_is_better": false,
         "score_type": "continuous",
         "min_score": 0.0,
diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py
index acb5330d9..a3a7aca96 100644
--- a/utils/helm/adapter.py
+++ b/utils/helm/adapter.py
@@ -193,9 +193,22 @@ def convert(
                     or "instruct" in leaderboard_name.lower()
                 )
 
+                if full_eval_name.lower().startswith('mean'):
+                    metric_name = None
+                    dataset_name = leaderboard_name
+                    evaluation_name = full_eval_name
+                else:
+                    dataset_name, metric_name = full_eval_name.split(' - ', 1)
+                    evaluation_name = dataset_name
+
+                if metric_name:
+                    evaluation_description = f'{metric_name} on {dataset_name}'
+                else:
+                    evaluation_description = header.get("description")
+
                 if is_new_metric:
                     metric_config = MetricConfig(
-                        evaluation_description=header.get("description"),
+                        evaluation_description=evaluation_description,
                         lower_is_better=header.get("lower_is_better", False),
                         min_score=(
                             0.0 if mins[col_idx] >= 0 else math.floor(mins[col_idx])
@@ -206,13 +219,10 @@ def convert(
                         score_type=ScoreType.continuous,
                     )
 
-                    if full_eval_name.lower().startswith('mean'):
-                        dataset_name = leaderboard_name
-                    else:
-                        dataset_name = full_eval_name.split(' - ')[0]
+                    source_dataset_name = leaderboard_name if leaderboard_name.lower() == 'helm_mmlu' else dataset_name
 
                     source_data = SourceDataUrl(
-                        dataset_name=dataset_name,
+                        dataset_name=source_dataset_name,
                         source_type='url',
                         url=[args.source_data_url]
                     )
@@ -224,7 +234,7 @@ def convert(
                     )
 
                     model_results[model_name][short_name] = EvaluationResult(
-                        evaluation_name=full_eval_name,
+                        evaluation_name=evaluation_name,
                         source_data=source_data,
                         metric_config=metric_config,
                         score_details=ScoreDetails(